Skip to content

Commit 4c5b89f

Browse files
ofaaland and behlendorf
authored and committed
Improved dnode allocation and dmu_hold_impl()
Refactor dmu_object_alloc_dnsize() and dnode_hold_impl() to simplify the code, fix errors introduced by commit dbeb879 (PR #6117) interacting badly with large dnodes, and improve performance. * When allocating a new dnode in dmu_object_alloc_dnsize(), update the percpu object ID for the core's metadnode chunk immediately. This eliminates most lock contention when taking the hold and creating the dnode. * Correct detection of the chunk boundary to work properly with large dnodes. * Separate the dmu_hold_impl() code for the FREE case from the code for the ALLOCATED case to make it easier to read. * Fully populate the dnode handle array immediately after reading a block of the metadnode from disk. Subsequently the dnode handle array provides enough information to determine which dnode slots are in use and which are free. * Add several kstats to allow the behavior of the code to be examined. * Verify dnode packing in large_dnode_008_pos.ksh. Since the test is purely creates, it should leave very few holes in the metadnode. * Add test large_dnode_009_pos.ksh, which performs concurrent creates and deletes, to complement existing test which does only creates. With the above fixes, there is very little contention in a test of about 200,000 racing dnode allocations produced by tests 'large_dnode_008_pos' and 'large_dnode_009_pos'. 
name type data dnode_hold_dbuf_hold 4 0 dnode_hold_dbuf_read 4 0 dnode_hold_alloc_hits 4 3804690 dnode_hold_alloc_misses 4 216 dnode_hold_alloc_interior 4 3 dnode_hold_alloc_lock_retry 4 0 dnode_hold_alloc_lock_misses 4 0 dnode_hold_alloc_type_none 4 0 dnode_hold_free_hits 4 203105 dnode_hold_free_misses 4 4 dnode_hold_free_lock_misses 4 0 dnode_hold_free_lock_retry 4 0 dnode_hold_free_overflow 4 0 dnode_hold_free_refcount 4 57 dnode_hold_free_txg 4 0 dnode_allocate 4 203154 dnode_reallocate 4 0 dnode_buf_evict 4 23918 dnode_alloc_next_chunk 4 4887 dnode_alloc_race 4 0 dnode_alloc_next_block 4 18 The performance is slightly improved for concurrent creates with 16+ threads, and unchanged for low thread counts. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Olaf Faaland <faaland1@llnl.gov> Closes #5396 Closes #6522 Closes #6414 Closes #6564
1 parent 65dcb0f commit 4c5b89f

File tree

9 files changed

+609
-251
lines changed

9 files changed

+609
-251
lines changed

cmd/zdb/zdb.c

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1934,7 +1934,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
19341934
};
19351935

19361936
static void
1937-
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
1937+
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
1938+
uint64_t *dnode_slots_used)
19381939
{
19391940
dmu_buf_t *db = NULL;
19401941
dmu_object_info_t doi;
@@ -1984,6 +1985,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
19841985
}
19851986
}
19861987

1988+
if (dnode_slots_used)
1989+
*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
1990+
19871991
zdb_nicenum(doi.doi_metadata_block_size, iblk);
19881992
zdb_nicenum(doi.doi_data_block_size, dblk);
19891993
zdb_nicenum(doi.doi_max_offset, lsize);
@@ -2104,6 +2108,9 @@ dump_dir(objset_t *os)
21042108
int verbosity = dump_opt['d'];
21052109
int print_header = 1;
21062110
int i, error;
2111+
uint64_t total_slots_used = 0;
2112+
uint64_t max_slot_used = 0;
2113+
uint64_t dnode_slots;
21072114

21082115
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
21092116
dmu_objset_fast_stat(os, &dds);
@@ -2144,7 +2151,7 @@ dump_dir(objset_t *os)
21442151
if (zopt_objects != 0) {
21452152
for (i = 0; i < zopt_objects; i++)
21462153
dump_object(os, zopt_object[i], verbosity,
2147-
&print_header);
2154+
&print_header, NULL);
21482155
(void) printf("\n");
21492156
return;
21502157
}
@@ -2161,24 +2168,39 @@ dump_dir(objset_t *os)
21612168
if (BP_IS_HOLE(os->os_rootbp))
21622169
return;
21632170

2164-
dump_object(os, 0, verbosity, &print_header);
2171+
dump_object(os, 0, verbosity, &print_header, NULL);
21652172
object_count = 0;
21662173
if (DMU_USERUSED_DNODE(os) != NULL &&
21672174
DMU_USERUSED_DNODE(os)->dn_type != 0) {
2168-
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
2169-
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
2175+
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
2176+
NULL);
2177+
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
2178+
NULL);
21702179
}
21712180

21722181
object = 0;
21732182
while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
2174-
dump_object(os, object, verbosity, &print_header);
2183+
dump_object(os, object, verbosity, &print_header, &dnode_slots);
21752184
object_count++;
2185+
total_slots_used += dnode_slots;
2186+
max_slot_used = object + dnode_slots - 1;
21762187
}
21772188

21782189
ASSERT3U(object_count, ==, usedobjs);
21792190

21802191
(void) printf("\n");
21812192

2193+
(void) printf(" Dnode slots:\n");
2194+
(void) printf("\tTotal used: %10llu\n",
2195+
(u_longlong_t)total_slots_used);
2196+
(void) printf("\tMax used: %10llu\n",
2197+
(u_longlong_t)max_slot_used);
2198+
(void) printf("\tPercent empty: %10lf\n",
2199+
(double)(max_slot_used - total_slots_used)*100 /
2200+
(double)max_slot_used);
2201+
2202+
(void) printf("\n");
2203+
21822204
if (error != ESRCH) {
21832205
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
21842206
abort();
@@ -2642,7 +2664,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
26422664
return (dump_path_impl(os, child_obj, s + 1));
26432665
/*FALLTHROUGH*/
26442666
case DMU_OT_PLAIN_FILE_CONTENTS:
2645-
dump_object(os, child_obj, dump_opt['v'], &header);
2667+
dump_object(os, child_obj, dump_opt['v'], &header, NULL);
26462668
return (0);
26472669
default:
26482670
(void) fprintf(stderr, "object %llu has non-file/directory "

include/sys/dnode.h

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,13 @@ extern "C" {
9898
#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
9999
#define DN_KILL_SPILLBLK (1)
100100

101+
#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */
102+
#define DN_SLOT_FREE ((void *)1UL) /* Free slot */
103+
#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */
104+
#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */
105+
#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR)
106+
#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL)
107+
101108
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
102109
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
103110

@@ -419,6 +426,135 @@ void dnode_evict_bonus(dnode_t *dn);
419426
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
420427
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
421428

429+
/*
 * Counters exported through the dnodestats kstat.  Each field is bumped
 * with DNODE_STAT_BUMP()/DNODE_STAT_INCR() so the behavior of dnode
 * allocation and dnode_hold_impl() can be examined at runtime.
 * NOTE(review): field order defines the kstat output order — do not
 * reorder.
 */
typedef struct dnode_stats {
	/*
	 * Number of failed attempts to hold a meta dnode dbuf.
	 */
	kstat_named_t dnode_hold_dbuf_hold;
	/*
	 * Number of failed attempts to read a meta dnode dbuf.
	 */
	kstat_named_t dnode_hold_dbuf_read;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
	 * to hold the requested object number which was allocated.  This is
	 * the common case when looking up any allocated object number.
	 */
	kstat_named_t dnode_hold_alloc_hits;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
	 * able to hold the requested object number because it was not
	 * allocated.
	 */
	kstat_named_t dnode_hold_alloc_misses;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
	 * able to hold the requested object number because the object number
	 * refers to an interior large dnode slot.
	 */
	kstat_named_t dnode_hold_alloc_interior;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
	 * to retry acquiring slot zrl locks due to contention.
	 */
	kstat_named_t dnode_hold_alloc_lock_retry;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
	 * need to create the dnode because another thread did so after
	 * dropping the read lock but before acquiring the write lock.
	 */
	kstat_named_t dnode_hold_alloc_lock_misses;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
	 * a free dnode instantiated by dnode_create() but not yet allocated
	 * by dnode_allocate().
	 */
	kstat_named_t dnode_hold_alloc_type_none;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
	 * to hold the requested range of free dnode slots.
	 */
	kstat_named_t dnode_hold_free_hits;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
	 * able to hold the requested range of free dnode slots because
	 * at least one slot was allocated.
	 */
	kstat_named_t dnode_hold_free_misses;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
	 * able to hold the requested range of free dnode slots because
	 * after acquiring the zrl lock at least one slot was allocated.
	 */
	kstat_named_t dnode_hold_free_lock_misses;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
	 * to retry acquiring slot zrl locks due to contention.
	 */
	kstat_named_t dnode_hold_free_lock_retry;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
	 * a range of dnode slots which were held by another thread.
	 */
	kstat_named_t dnode_hold_free_refcount;
	/*
	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
	 * a range of dnode slots which would overflow the dnode_phys_t.
	 */
	kstat_named_t dnode_hold_free_overflow;
	/*
	 * Number of times a dnode_hold(...) was attempted on a dnode
	 * which had already been unlinked in an earlier txg.
	 */
	kstat_named_t dnode_hold_free_txg;
	/*
	 * Number of new dnodes allocated by dnode_allocate().
	 */
	kstat_named_t dnode_allocate;
	/*
	 * Number of dnodes re-allocated by dnode_reallocate().
	 */
	kstat_named_t dnode_reallocate;
	/*
	 * Number of meta dnode dbufs evicted.
	 */
	kstat_named_t dnode_buf_evict;
	/*
	 * Number of times dmu_object_alloc*() reached the end of the existing
	 * object ID chunk and advanced to a new one.
	 */
	kstat_named_t dnode_alloc_next_chunk;
	/*
	 * Number of times multiple threads attempted to allocate a dnode
	 * from the same block of free dnodes.
	 */
	kstat_named_t dnode_alloc_race;
	/*
	 * Number of times dmu_object_alloc*() was forced to advance to the
	 * next meta dnode dbuf due to an error from dmu_object_next().
	 */
	kstat_named_t dnode_alloc_next_block;
	/*
	 * Statistics for tracking dnodes which have been moved.
	 */
	kstat_named_t dnode_move_invalid;
	kstat_named_t dnode_move_recheck1;
	kstat_named_t dnode_move_recheck2;
	kstat_named_t dnode_move_special;
	kstat_named_t dnode_move_handle;
	kstat_named_t dnode_move_rwlock;
	kstat_named_t dnode_move_active;
} dnode_stats_t;
550+
551+
/* Global dnodestats counters, defined in dnode.c. */
extern dnode_stats_t dnode_stats;

/*
 * Atomically add 'val' to (or bump by one) the named dnode kstat
 * counter.  The expansions deliberately carry no trailing semicolon so
 * the macros behave like ordinary statements — callers supply their
 * own ';' and the macros remain safe in unbraced if/else bodies.
 */
#define	DNODE_STAT_INCR(stat, val) \
	atomic_add_64(&dnode_stats.stat.value.ui64, (val))
#define	DNODE_STAT_BUMP(stat) \
	DNODE_STAT_INCR(stat, 1)
557+
422558
#ifdef ZFS_DEBUG
423559

424560
/*

module/zfs/dbuf_stats.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
7272
if (db->db_buf)
7373
arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
7474

75-
if (dn)
76-
__dmu_object_info_from_dnode(dn, &doi);
75+
__dmu_object_info_from_dnode(dn, &doi);
7776

7877
nwritten = snprintf(buf, size,
7978
"%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "

module/zfs/dmu_object.c

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,10 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
9393
* If we finished a chunk of dnodes, get a new one from
9494
* the global allocator.
9595
*/
96-
if (P2PHASE(object, dnodes_per_chunk) == 0) {
96+
if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
97+
(P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
98+
dn_slots)) {
99+
DNODE_STAT_BUMP(dnode_alloc_next_chunk);
97100
mutex_enter(&os->os_obj_lock);
98101
ASSERT0(P2PHASE(os->os_obj_next_chunk,
99102
dnodes_per_chunk));
@@ -157,6 +160,13 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
157160
mutex_exit(&os->os_obj_lock);
158161
}
159162

163+
/*
164+
* The value of (*cpuobj) before adding dn_slots is the object
165+
* ID assigned to us. The value afterwards is the object ID
166+
* assigned to whoever wants to do an allocation next.
167+
*/
168+
object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
169+
160170
/*
161171
* XXX We should check for an i/o error here and return
162172
* up to our caller. Actually we should pre-read it in
@@ -177,21 +187,20 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
177187
rw_exit(&dn->dn_struct_rwlock);
178188
dmu_tx_add_new_object(tx, dn);
179189
dnode_rele(dn, FTAG);
180-
181-
(void) atomic_swap_64(cpuobj,
182-
object + dn_slots);
183190
return (object);
184191
}
185192
rw_exit(&dn->dn_struct_rwlock);
186193
dnode_rele(dn, FTAG);
194+
DNODE_STAT_BUMP(dnode_alloc_race);
187195
}
188196

197+
/*
198+
* Skip to next known valid starting point on error. This
199+
* is the start of the next block of dnodes.
200+
*/
189201
if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
190-
/*
191-
* Skip to next known valid starting point for a
192-
* dnode.
193-
*/
194202
object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
203+
DNODE_STAT_BUMP(dnode_alloc_next_block);
195204
}
196205
(void) atomic_swap_64(cpuobj, object);
197206
}
@@ -304,24 +313,37 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
304313
if (*objectp == 0) {
305314
start_obj = 1;
306315
} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
316+
uint64_t i = *objectp + 1;
317+
uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
318+
dmu_object_info_t doi;
319+
307320
/*
308-
* For large_dnode datasets, scan from the beginning of the
309-
* dnode block to find the starting offset. This is needed
310-
* because objectp could be part of a large dnode so we can't
311-
* assume it's a hole even if dmu_object_info() returns ENOENT.
321+
* Scan through the remaining meta dnode block. The contents
322+
* of each slot in the block are known so it can be quickly
323+
* checked. If the block is exhausted without a match then
324+
* hand off to dnode_next_offset() for further scanning.
312325
*/
313-
int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT;
314-
int skip;
315-
uint64_t i;
316-
317-
for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) {
318-
dmu_object_info_t doi;
319-
326+
while (i <= last_obj) {
320327
error = dmu_object_info(os, i, &doi);
321-
if (error != 0)
322-
skip = 1;
323-
else
324-
skip = doi.doi_dnodesize >> DNODE_SHIFT;
328+
if (error == ENOENT) {
329+
if (hole) {
330+
*objectp = i;
331+
return (0);
332+
} else {
333+
i++;
334+
}
335+
} else if (error == EEXIST) {
336+
i++;
337+
} else if (error == 0) {
338+
if (hole) {
339+
i += doi.doi_dnodesize >> DNODE_SHIFT;
340+
} else {
341+
*objectp = i;
342+
return (0);
343+
}
344+
} else {
345+
return (error);
346+
}
325347
}
326348

327349
start_obj = i;

0 commit comments

Comments
 (0)