diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 06ce396ef85d..ed1349d665e9 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1672,6 +1672,7 @@ typedef struct mdb_metaslab_alloc_trace { uint64_t mat_weight; uint64_t mat_offset; uint32_t mat_dva_id; + int mat_allocator; } mdb_metaslab_alloc_trace_t; static void @@ -1744,8 +1745,9 @@ metaslab_trace(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } if (!(flags & DCMD_PIPE_OUT) && DCMD_HDRSPEC(flags)) { - mdb_printf("%%6s %6s %8s %11s %18s %18s%\n", - "MSID", "DVA", "ASIZE", "WEIGHT", "RESULT", "VDEV"); + mdb_printf("%%6s %6s %8s %11s %11s %18s %18s%\n", + "MSID", "DVA", "ASIZE", "ALLOCATOR", "WEIGHT", "RESULT", + "VDEV"); } if (mat.mat_msp != NULL) { @@ -1760,7 +1762,8 @@ metaslab_trace(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("%6s ", "-"); } - mdb_printf("%6d %8llx ", mat.mat_dva_id, mat.mat_size); + mdb_printf("%6d %8llx %11llx ", mat.mat_dva_id, mat.mat_size, + mat.mat_allocator); metaslab_print_weight(mat.mat_weight); diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh index 7b5870b8d622..1e86a66b7cc3 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh @@ -12,7 +12,7 @@ # # -# Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright (c) 2016, 2018 by Delphix. All rights reserved. # . $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -49,6 +49,7 @@ function custom_cleanup set_vdev_validate_skip 0 cleanup log_must mdb_ctf_set_int vdev_min_ms_count 0t16 + log_must mdb_ctf_set_int spa_allocators 0t4 } log_onexit custom_cleanup @@ -207,6 +208,10 @@ increase_device_sizes $(( FILE_SIZE * 4 )) # reduce the chance of reusing a metaslab that holds old MOS metadata. log_must mdb_ctf_set_int vdev_min_ms_count 0t150 +# Decrease the number of allocators for pools created during this test, +# to increase the odds that metadata survives from old txgs. +log_must mdb_ctf_set_int spa_allocators 0t1 + # Part of the rewind test is to see how it reacts to path changes typeset pathstochange="$VDEV0 $VDEV1 $VDEV2 $VDEV3" diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh index b9dbc6c97ed9..621ac23aa9a8 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2013, 2018 by Delphix. All rights reserved. # . 
$STF_SUITE/tests/functional/slog/slog.kshlib @@ -52,9 +52,19 @@ do log_must zpool create $TESTPOOL $type $VDEV $spare $SDEV \ log $LDEV + # Create a file to be corrupted + dd if=/dev/urandom of=/$TESTPOOL/filler bs=1024k count=50 + + # + # Ensure the file has been synced out before attempting to + # corrupt its contents. + # + sync + + # # Corrupt a pool device to make the pool DEGRADED - dd if=/dev/urandom of=/$TESTPOOL/filler bs=1024k count=50 # The oseek value below is to skip past the vdev label. + # log_must dd if=/dev/urandom of=$VDIR/a bs=1024k oseek=4 \ conv=notrunc count=50 log_must zpool scrub $TESTPOOL diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 25f5cc964987..ce0dad59cd9d 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -212,6 +212,8 @@ static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); +static void metaslab_passivate(metaslab_t *msp, uint64_t weight); +static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); kmem_cache_t *metaslab_alloc_trace_cache; @@ -231,7 +233,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); - refcount_create_tracked(&mc->mc_alloc_slots); + mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (refcount_t), KM_SLEEP); + mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) + refcount_create_tracked(&mc->mc_alloc_slots[i]); return (mc); } @@ -245,7 +252,12 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); - refcount_destroy(&mc->mc_alloc_slots); + for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) + refcount_destroy(&mc->mc_alloc_slots[i]); + kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * + sizeof (refcount_t)); + kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * + sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -442,6 +454,30 @@ metaslab_compare(const void *x1, const void *x2) const metaslab_t *m1 = x1; const metaslab_t *m2 = x2; + int sort1 = 0; + int sort2 = 0; + if (m1->ms_allocator != -1 && m1->ms_primary) + sort1 = 1; + else if (m1->ms_allocator != -1 && !m1->ms_primary) + sort1 = 2; + if (m2->ms_allocator != -1 && m2->ms_primary) + sort2 = 1; + else if (m2->ms_allocator != -1 && !m2->ms_primary) + sort2 = 2; + + /* + * Sort inactive metaslabs first, then primaries, then secondaries. When + * selecting a metaslab to allocate from, an allocator first tries its + * primary, then secondary active metaslab. If it doesn't have active + * metaslabs, or can't allocate from them, it searches for an inactive + * metaslab to activate. If it can't find a suitable one, it will steal + * a primary or secondary metaslab from another allocator. 
+ */ + if (sort1 < sort2) + return (-1); + if (sort1 > sort2) + return (1); + if (m1->ms_weight < m2->ms_weight) return (1); if (m1->ms_weight > m2->ms_weight) @@ -593,12 +629,16 @@ metaslab_group_alloc_update(metaslab_group_t *mg) } metaslab_group_t * -metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); + mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); mg->mg_vd = vd; @@ -606,7 +646,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; - refcount_create_tracked(&mg->mg_alloc_queue_depth); + mg->mg_allocators = allocators; + + mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), + KM_SLEEP); + mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < allocators; i++) { + refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); @@ -628,8 +677,20 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); + kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); + kmem_free(mg->mg_secondaries, mg->mg_allocators * + sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); - refcount_destroy(&mg->mg_alloc_queue_depth); + + for (int i = 0; i < mg->mg_allocators; i++) { + refcount_destroy(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } + kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * + sizeof (refcount_t)); + kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * + sizeof (uint64_t)); + kmem_free(mg, sizeof (metaslab_group_t)); } @@ -708,6 +769,22 @@ metaslab_group_passivate(metaslab_group_t *mg) taskq_wait(mg->mg_taskq); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); + for (int i = 0; i < mg->mg_allocators; i++) { + metaslab_t *msp = mg->mg_primaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + msp = mg->mg_secondaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + } mgprev = mg->mg_prev; mgnext = mg->mg_next; @@ -847,6 +924,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) mutex_exit(&mg->mg_lock); } +static void +metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + ASSERT(MUTEX_HELD(&mg->mg_lock)); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + +} + static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { @@ -858,10 +946,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); - 
ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_weight = weight; - avl_add(&mg->mg_metaslab_tree, msp); + metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } @@ -909,7 +994,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg) */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize) + uint64_t psize, int allocator) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; @@ -938,7 +1023,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_allocatable) { metaslab_group_t *mgp; int64_t qdepth; - uint64_t qmax = mg->mg_max_alloc_queue_depth; + uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); @@ -950,7 +1035,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_no_free_space) return (B_FALSE); - qdepth = refcount_count(&mg->mg_alloc_queue_depth); + qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]); /* * If this metaslab group is below its qmax or it's @@ -969,9 +1054,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * groups at the same time when we make this check. */ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { - qmax = mgp->mg_max_alloc_queue_depth; + qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; - qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + qdepth = refcount_count( + &mgp->mg_alloc_queue_depth[allocator]); /* * If there is another metaslab group that @@ -1458,6 +1544,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; + ms->ms_allocator = -1; + ms->ms_new = B_TRUE; /* * We only open space map objects that already exist. All others @@ -1553,6 +1641,7 @@ metaslab_fini(metaslab_t *msp) cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); + ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } @@ -1949,19 +2038,59 @@ metaslab_weight(metaslab_t *msp) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, + int allocator, uint64_t activation_weight) +{ + /* + * If we're activating for the claim code, we don't want to actually + * set the metaslab up for a specific allocator. + */ + if (activation_weight == METASLAB_WEIGHT_CLAIM) + return (0); + metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 
+ mg->mg_primaries : mg->mg_secondaries); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + mutex_enter(&mg->mg_lock); + if (arr[allocator] != NULL) { + mutex_exit(&mg->mg_lock); + return (EEXIST); + } + + arr[allocator] = msp; + ASSERT3S(msp->ms_allocator, ==, -1); + msp->ms_allocator = allocator; + msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + mutex_exit(&mg->mg_lock); + + return (0); +} + +static int +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int error = 0; metaslab_load_wait(msp); if (!msp->ms_loaded) { - int error = metaslab_load(msp); - if (error) { + if ((error = metaslab_load(msp)) != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } } + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + /* + * The metaslab was activated for another allocator + * while we were waiting, we should reselect. + */ + return (EBUSY); + } + if ((error = metaslab_activate_allocator(msp->ms_group, msp, + allocator, activation_weight)) != 0) { + return (error); + } msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, @@ -1973,6 +2102,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) return (0); } +static void +metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, + uint64_t weight) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_group_sort(mg, msp, weight); + return; + } + + mutex_enter(&mg->mg_lock); + ASSERT3P(msp->ms_group, ==, mg); + if (msp->ms_primary) { + ASSERT3U(0, <=, msp->ms_allocator); + ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); + ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); + ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + mg->mg_primaries[msp->ms_allocator] = NULL; + } else { + ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); + mg->mg_secondaries[msp->ms_allocator] = NULL; + } + msp->ms_allocator = -1; + metaslab_group_sort_impl(mg, msp, weight); + mutex_exit(&mg->mg_lock); +} + static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { @@ -1988,7 +2145,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; - metaslab_group_sort(msp->ms_group, msp, weight); + metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } @@ -2566,11 +2723,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } + if (msp->ms_new) { + msp->ms_new = B_FALSE; + mutex_enter(&mg->mg_lock); + mg->mg_ms_ready++; + mutex_exit(&mg->mg_lock); + } /* * Calculate the new weights before unloading any metaslabs. * This will give us the most accurate weighting. 
*/ - metaslab_group_sort(mg, msp, metaslab_weight(msp)); + metaslab_group_sort(mg, msp, metaslab_weight(msp) | + (msp->ms_weight & METASLAB_ACTIVE_MASK)); /* * If the metaslab is loaded and we've not tried to load or allocate @@ -2582,6 +2746,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } if (!metaslab_debug_unload) metaslab_unload(msp); @@ -2675,7 +2843,8 @@ metaslab_alloc_trace_fini(void) */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, - metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) + metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, + int allocator) { if (!metaslab_trace_enabled) return; @@ -2708,6 +2877,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; + mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; @@ -2748,35 +2918,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal) */ static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_add(&mg->mg_alloc_queue_depth, tag); + (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); +} + +static void +metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) +{ + uint64_t max = mg->mg_max_alloc_queue_depth; + uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + while (cur < max) { + if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], + cur, cur + 1) == cur) { + atomic_inc_64( + &mg->mg_class->mc_alloc_max_slots[allocator]); + return; + } + cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + } } void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); + (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); + if (io_complete) + metaslab_group_increment_qdepth(mg, allocator); } void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, + int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; @@ -2785,7 +2976,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], + tag)); } #endif } @@ -2827,91 +3019,146 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) return (start); } +/* + * Find the metaslab with the highest weight 
that is less than what we've + * already tried. In the common case, this means that we will examine each + * metaslab at most once. Note that concurrent callers could reorder metaslabs + * by activation/passivation once we have dropped the mg_lock. If a metaslab is + * activated by another thread, and we fail to allocate from the metaslab we + * have selected, we may not try the newly-activated metaslab, and instead + * activate another metaslab. This is not optimal, but generally does not cause + * any problems (a possible exception being if every metaslab is completely full + * except for the the newly-activated metaslab which we fail to examine). + */ +static metaslab_t * +find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, + dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, + zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) +{ + avl_index_t idx; + avl_tree_t *t = &mg->mg_metaslab_tree; + metaslab_t *msp = avl_find(t, search, &idx); + if (msp == NULL) + msp = avl_nearest(t, idx, AVL_AFTER); + + for (; msp != NULL; msp = AVL_NEXT(t, msp)) { + int i; + if (!metaslab_should_allocate(msp, asize)) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL, allocator); + continue; + } + + /* + * If the selected metaslab is condensing, skip it. + */ + if (msp->ms_condensing) + continue; + + *was_active = msp->ms_allocator != -1; + /* + * If we're activating as primary, this is our first allocation + * from this disk, so we don't need to check how close we are. + * If the metaslab under consideration was already active, + * we're getting desperate enough to steal another allocator's + * metaslab, so we still don't care about distances. + */ + if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) + break; + + uint64_t target_distance = min_distance + + (space_map_allocated(msp->ms_sm) != 0 ? 0 : + min_distance >> 1); + + for (i = 0; i < d; i++) { + if (metaslab_distance(msp, &dva[i]) < target_distance) + break; + } + if (i == d) + break; + } + + if (msp != NULL) { + search->ms_weight = msp->ms_weight; + search->ms_start = msp->ms_start + 1; + search->ms_allocator = msp->ms_allocator; + search->ms_primary = msp->ms_primary; + } + return (msp); +} + +/* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, + int allocator) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight; - uint64_t target_distance; - int i; + boolean_t tertiary = B_FALSE; activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) { - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + for (int i = 0; i < d; i++) { + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + tertiary = B_TRUE; break; } } + /* + * If we don't have enough metaslabs active to fill the entire array, we + * just use the 0th slot. 
+ */ + if (mg->mg_ms_ready < mg->mg_allocators * 2) { + tertiary = B_FALSE; + allocator = 0; + } + + ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); + metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; + /* + * At the end of the metaslab tree are the already-active metaslabs, + * first the primaries, then the secondaries. When we resume searching + * through the tree, we need to consider ms_allocator and ms_primary so + * we start in the location right after where we left off, and don't + * accidentally loop forever considering the same metaslabs. + */ + search->ms_allocator = -1; + search->ms_primary = B_TRUE; for (;;) { - boolean_t was_active; - avl_tree_t *t = &mg->mg_metaslab_tree; - avl_index_t idx; + boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); - /* - * Find the metaslab with the highest weight that is less - * than what we've already tried. In the common case, this - * means that we will examine each metaslab at most once. - * Note that concurrent callers could reorder metaslabs - * by activation/passivation once we have dropped the mg_lock. - * If a metaslab is activated by another thread, and we fail - * to allocate from the metaslab we have selected, we may - * not try the newly-activated metaslab, and instead activate - * another metaslab. This is not optimal, but generally - * does not cause any problems (a possible exception being - * if every metaslab is completely full except for the - * the newly-activated metaslab which we fail to examine). - */ - msp = avl_find(t, search, &idx); - if (msp == NULL) - msp = avl_nearest(t, idx, AVL_AFTER); - for (; msp != NULL; msp = AVL_NEXT(t, msp)) { - - if (!metaslab_should_allocate(msp, asize)) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); - continue; - } - - /* - * If the selected metaslab is condensing, skip it. - */ - if (msp->ms_condensing) - continue; - - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - if (activation_weight == METASLAB_WEIGHT_PRIMARY) - break; - - target_distance = min_distance + - (space_map_allocated(msp->ms_sm) != 0 ? 0 : - min_distance >> 1); - - for (i = 0; i < d; i++) { - if (metaslab_distance(msp, &dva[i]) < - target_distance) - break; - } - if (i == d) - break; + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + mg->mg_primaries[allocator] != NULL) { + msp = mg->mg_primaries[allocator]; + was_active = B_TRUE; + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + mg->mg_secondaries[allocator] != NULL && !tertiary) { + msp = mg->mg_secondaries[allocator]; + was_active = B_TRUE; + } else { + msp = find_valid_metaslab(mg, activation_weight, dva, d, + min_distance, asize, allocator, zal, search, + &was_active); } + mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } - search->ms_weight = msp->ms_weight; - search->ms_start = msp->ms_start + 1; mutex_enter(&msp->ms_lock); - /* * Ensure that the metaslab we have selected is still * capable of handling our request. 
It's possible that @@ -2925,18 +3172,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } - if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && - activation_weight == METASLAB_WEIGHT_PRIMARY) { - metaslab_passivate(msp, - msp->ms_weight & ~METASLAB_ACTIVE_MASK); + /* + * If the metaslab is freshly activated for an allocator that + * isn't the one we're allocating from, or if it's a primary and + * we're seeking a secondary (or vice versa), we go back and + * select a new metaslab. + */ + if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && + (msp->ms_allocator != -1) && + (msp->ms_allocator != allocator || ((activation_weight == + METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { mutex_exit(&msp->ms_lock); continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } + + if (metaslab_activate(msp, allocator, activation_weight) != 0) { + mutex_exit(&msp->ms_lock); + continue; + } + msp->ms_selected_txg = txg; /* @@ -2949,7 +3210,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (!metaslab_should_allocate(msp, asize)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); + TRACE_TOO_SMALL, allocator); goto next; } @@ -2960,13 +3221,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_CONDENSING); + TRACE_CONDENSING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset); + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ @@ -3022,19 +3285,20 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, + int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, - min_distance, dva, d); + min_distance, dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, - TRACE_GROUP_FAILURE); + TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate @@ -3069,7 +3333,7 @@ int ditto_same_vdev_distance_shift = 3; int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal) + zio_alloc_list_t *zal, int allocator) { metaslab_group_t *mg, *rotor; vdev_t *vd; @@ -3081,7 +3345,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * For testing, make some blocks above a certain size be gang blocks. 
*/ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { - metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, + allocator); return (SET_ERROR(ENOSPC)); } @@ -3167,12 +3432,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, - psize); + psize, allocator); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_NOT_ALLOCATABLE); + TRACE_NOT_ALLOCATABLE, allocator); goto next; } @@ -3187,7 +3452,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, vd->vdev_state < VDEV_STATE_HEALTHY) && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_VDEV_ERROR); + TRACE_VDEV_ERROR, allocator); goto next; } @@ -3211,7 +3476,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - distance, dva, d); + distance, dva, d, allocator); if (offset != -1ULL) { /* @@ -3274,7 +3539,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, bzero(&dva[d], sizeof (dva_t)); - metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } @@ -3575,18 +3840,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) * the reservation. */ boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, - int flags) +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, + zio_t *zio, int flags) { uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; + uint64_t max = mc->mc_alloc_max_slots[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); - if (reserved_slots < mc->mc_alloc_max_slots) - available_slots = mc->mc_alloc_max_slots - reserved_slots; + uint64_t reserved_slots = + refcount_count(&mc->mc_alloc_slots[allocator]); + if (reserved_slots < max) + available_slots = max - reserved_slots; if (slots <= available_slots || GANG_ALLOCATION(flags)) { /* @@ -3594,7 +3861,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, * them individually when an I/O completes. 
*/ for (int d = 0; d < slots; d++) { - reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + reserved_slots = + refcount_add(&mc->mc_alloc_slots[allocator], + zio); } zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; @@ -3605,12 +3874,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, } void -metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, + int allocator, zio_t *zio) { ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) { - (void) refcount_remove(&mc->mc_alloc_slots, zio); + (void) refcount_remove(&mc->mc_alloc_slots[allocator], + zio); } mutex_exit(&mc->mc_lock); } @@ -3632,7 +3903,13 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); + /* + * No need to fail in that case; someone else has activated the + * metaslab, but that doesn't preclude us from using it. + */ + if (error == EBUSY) + error = 0; if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) @@ -3737,7 +4014,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, - zio_alloc_list_t *zal, zio_t *zio) + zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -3760,12 +4037,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal); + txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, + allocator, B_FALSE); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -3776,7 +4054,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 064e4411db80..163f5e054e3e 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -7398,9 +7398,11 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * If there are any pending vdev state changes, convert them @@ -7459,7 +7461,7 @@ spa_sync(spa_t *spa, uint64_t txg) * The max queue depth will not change in the middle of syncing * out this txg. 
*/ - uint64_t queue_depth_total = 0; + uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -7473,18 +7475,23 @@ spa_sync(spa_t *spa, uint64_t txg) * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ - ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i]))); mg->mg_max_alloc_queue_depth = max_queue_depth; - queue_depth_total += mg->mg_max_alloc_queue_depth; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mg->mg_cur_max_alloc_queue_depth[i] = + zfs_vdev_def_queue_depth; + } + slots_per_allocator += zfs_vdev_def_queue_depth; } metaslab_class_t *mc = spa_normal_class(spa); - ASSERT0(refcount_count(&mc->mc_alloc_slots)); - mc->mc_alloc_max_slots = queue_depth_total; + for (int i = 0; i < spa->spa_alloc_count; i++) { + ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); + mc->mc_alloc_max_slots[i] = slots_per_allocator; + } mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - ASSERT3U(mc->mc_alloc_max_slots, <=, - max_queue_depth * rvd->vdev_children); - for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); @@ -7661,9 +7668,11 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * Update usable space statistics. diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index efb4993c0a05..fe0971a7208c 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -357,6 +357,8 @@ int spa_asize_inflation = 24; int spa_slop_shift = 5; uint64_t spa_min_slop = 128 * 1024 * 1024; +int spa_allocators = 4; + /*PRINTFLIKE2*/ void spa_load_failed(spa_t *spa, const char *fmt, ...) 
@@ -607,7 +609,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -658,8 +659,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_active_count++; } - avl_create(&spa->spa_alloc_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + spa->spa_alloc_count = spa_allocators; + spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * + sizeof (kmutex_t), KM_SLEEP); + spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * + sizeof (avl_tree_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); + avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + } /* * Every pool starts with the default cachefile @@ -746,7 +755,15 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } - avl_destroy(&spa->spa_alloc_tree); + for (int i = 0; i < spa->spa_alloc_count; i++) { + avl_destroy(&spa->spa_alloc_trees[i]); + mutex_destroy(&spa->spa_alloc_locks[i]); + } + kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * + sizeof (kmutex_t)); + kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * + sizeof (avl_tree_t)); + list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); @@ -777,7 +794,6 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); - mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index a76d344d21b3..32a19ca64584 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_H @@ -65,9 +65,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_DONT_THROTTLE 0x10 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, + int); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, - dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *); + dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); @@ -88,9 +89,9 @@ int metaslab_class_validate(metaslab_class_t *); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); -void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); +void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, int64_t, int64_t); @@ -99,7 +100,7 @@ uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); -metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); +metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); @@ -108,8 +109,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); -void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int); -void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *); +void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, + boolean_t); +void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index a6673fbe06a2..6a02f7c80067 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_IMPL_H @@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace { uint64_t mat_weight; uint32_t mat_dva_id; uint64_t mat_offset; + int mat_allocator; } metaslab_alloc_trace_t; /* @@ -72,9 +73,11 @@ typedef enum trace_alloc_type { #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_WEIGHT_TYPE (1ULL << 61) +#define METASLAB_WEIGHT_CLAIM (1ULL << 61) +#define METASLAB_WEIGHT_TYPE (1ULL << 60) #define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ + METASLAB_WEIGHT_CLAIM) /* * The metaslab weight is used to encode the amount of free space in a @@ -97,37 +100,39 @@ typedef enum trace_alloc_type { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PS1| weighted-free space | + * |PSC1| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PS0| idx| count of segments in region | + * |PSC0| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio * idx - index for the highest bucket in the histogram * count - number of segments in the specified bucket */ -#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2) -#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x) +#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) +#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) #define WEIGHT_IS_SPACEBASED(weight) \ - ((weight) == 0 || BF64_GET((weight), 61, 1)) -#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1) + ((weight) == 0 || BF64_GET((weight), 60, 1)) +#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) /* * These macros are only applicable to segment-based weighting. */ -#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6) -#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x) -#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55) -#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x) +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) /* * A metaslab class encompasses a category of allocatable top-level vdevs. @@ -178,8 +183,8 @@ struct metaslab_class { * allowed to reserve slots even if we've reached the maximum * number of allocations allowed. */ - uint64_t mc_alloc_max_slots; - refcount_t mc_alloc_slots; + uint64_t *mc_alloc_max_slots; + refcount_t *mc_alloc_slots; uint64_t mc_alloc_groups; /* # of allocatable groups */ @@ -201,9 +206,12 @@ struct metaslab_class { */ struct metaslab_group { kmutex_t mg_lock; + metaslab_t **mg_primaries; + metaslab_t **mg_secondaries; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? 
*/ + uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after @@ -223,15 +231,33 @@ struct metaslab_group { metaslab_group_t *mg_next; /* - * Each metaslab group can handle mg_max_alloc_queue_depth allocations - * which are tracked by mg_alloc_queue_depth. It's possible for a - * metaslab group to handle more allocations than its max. This - * can occur when gang blocks are required or when other groups - * are unable to handle their share of allocations. + * In order for the allocation throttle to function properly, we cannot + * have too many IOs going to each disk by default; the throttle + * operates by allocating more work to disks that finish quickly, so + * allocating larger chunks to each disk reduces its effectiveness. + * However, if the number of IOs going to each allocator is too small, + * we will not perform proper aggregation at the vdev_queue layer, + * also resulting in decreased performance. Therefore, we will use a + * ramp-up strategy. + * + * Each allocator in each metaslab group has a current queue depth + * (mg_alloc_queue_depth[allocator]) and a current max queue depth + * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group + * has an absolute max queue depth (mg_max_alloc_queue_depth). We + * add IOs to an allocator until the mg_alloc_queue_depth for that + * allocator hits the cur_max. Every time an IO completes for a given + * allocator on a given metaslab group, we increment its cur_max until + * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to + * help protect against disks that decrease in performance over time. + * + * It's possible for an allocator to handle more allocations than + * its max. This can occur when gang blocks are required or when other + * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; - refcount_t mg_alloc_queue_depth; - + uint64_t *mg_cur_max_alloc_queue_depth; + refcount_t *mg_alloc_queue_depth; + int mg_allocators; /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out @@ -355,6 +381,13 @@ struct metaslab { uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ + /* + * -1 if it's not active in an allocator, otherwise set to the allocator + * this metaslab is active for. + */ + int ms_allocator; + boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ + /* * The metaslab block allocators can optionally use a size-ordered * range tree and/or an array of LBAs. Not all allocators use @@ -369,6 +402,8 @@ struct metaslab { metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + + boolean_t ms_new; }; #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index a23f9b8c4f79..a6da04b67b59 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -237,8 +237,16 @@ struct spa { uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ - kmutex_t spa_alloc_lock; - avl_tree_t spa_alloc_tree; + /* + * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are + * stored in spa_alloc_count. 
There is one tree and one lock for each + * allocator, to help improve allocation performance in write-heavy + * workloads. + */ + kmutex_t *spa_alloc_locks; + avl_tree_t *spa_alloc_trees; + int spa_alloc_count; + spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 6f321323bfc5..f75233af6028 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -59,6 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern int zfs_vdev_queue_depth_pct; +extern int zfs_vdev_def_queue_depth; extern uint32_t zfs_vdev_async_write_max_active; /* diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 886e9ca51d3d..7009cb99373c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome @@ -458,6 +458,7 @@ struct zio { void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; + int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; @@ -515,8 +516,8 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags); -extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t size, boolean_t *slog); +extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, + blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index e23e4a01c1e3..1a8c1464b725 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -644,7 +644,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? - spa_log_class(spa) : spa_normal_class(spa), vd); + spa_log_class(spa) : spa_normal_class(spa), vd, + spa->spa_alloc_count); } if (vd->vdev_ops->vdev_op_leaf && @@ -1013,7 +1014,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { uint64_t object = 0; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index cb725c858cb1..f0cca0538d75 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -187,6 +187,15 @@ int zfs_vdev_queue_depth_pct = 1000; int zfs_vdev_queue_depth_pct = 300; #endif +/* + * When performing allocations for a given metaslab, we want to make sure that + * there are enough IOs to aggregate together to improve throughput. We want to + * ensure that there are at least 128k worth of IOs that can be aggregated, and + * we assume that the average allocation size is 4k, so we need the queue depth + * to be 32 per allocator to get good aggregation of sequential writes. + */ +int zfs_vdev_def_queue_depth = 32; + int vdev_queue_offset_compare(const void *x1, const void *x2) diff --git a/usr/src/uts/common/fs/zfs/vdev_removal.c b/usr/src/uts/common/fs/zfs/vdev_removal.c index f8d149fc11ea..d00b5b35f743 100644 --- a/usr/src/uts/common/fs/zfs/vdev_removal.c +++ b/usr/src/uts/common/fs/zfs/vdev_removal.c @@ -806,8 +806,15 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + /* + * We use allocator 0 for this I/O because we don't expect device remap + * to be the steady state of the system, so parallelizing is not as + * critical as it is for other allocation types. We also want to ensure + * that the IOs are allocated together as much as possible, to reduce + * mapping sizes. + */ int error = metaslab_alloc_dva(spa, mg->mg_class, size, - &dst, 0, NULL, txg, 0, zal); + &dst, 0, NULL, txg, 0, zal, 0); if (error != 0) return (error); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index cd0a7dc108ce..d62dc97fc6fd 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -656,7 +656,8 @@ zil_create(zilog_t *zilog) BP_ZERO(&blk); } - error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, + error = zio_alloc_zil(zilog->zl_spa, + zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, ZIL_MIN_BLKSZ, &slog); if (error == 0) @@ -1333,7 +1334,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog); + error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, + txg, bp, &lwb->lwb_blk, zil_blksz, &slog); if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index e2174b3cf4fa..b3226a053fb8 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -42,6 +42,7 @@ #include #include #include +#include /* * ========================================================================== @@ -2230,7 +2231,8 @@ zio_write_gang_block(zio_t *pio) ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); flags |= METASLAB_ASYNC_ALLOC; - VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); + VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator], + pio)); /* * The logical zio has already placed a reservation for @@ -2241,12 +2243,12 @@ zio_write_gang_block(zio_t *pio) * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, - pio, flags)); + pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio); + &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2260,7 +2262,7 @@ zio_write_gang_block(zio_t *pio) * stage. */ metaslab_class_throttle_unreserve(mc, - gbh_copies - copies, pio); + gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); @@ -2318,7 +2320,7 @@ zio_write_gang_block(zio_t *pio) * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, - zp.zp_copies, cio, flags)); + zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } @@ -2808,13 +2810,13 @@ zio_ddt_free(zio_t *zio) */ static zio_t * -zio_io_to_allocate(spa_t *spa) +zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); + ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); - zio = avl_first(&spa->spa_alloc_tree); + zio = avl_first(&spa->spa_alloc_trees[allocator]); if (zio == NULL) return (NULL); @@ -2824,12 +2826,13 @@ zio_io_to_allocate(spa_t *spa) * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ + ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(spa_normal_class(spa), - zio->io_prop.zp_copies, zio, 0)) { + zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } - avl_remove(&spa->spa_alloc_tree, zio); + avl_remove(&spa->spa_alloc_trees[allocator], zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); @@ -2853,13 +2856,23 @@ zio_dva_throttle(zio_t *zio) ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - mutex_enter(&spa->spa_alloc_lock); + zbookmark_phys_t *bm = &zio->io_bookmark; + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. 
+ */ + zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; + mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); ASSERT(zio->io_type == ZIO_TYPE_WRITE); - avl_add(&spa->spa_alloc_tree, zio); + avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(zio->io_spa); - mutex_exit(&spa->spa_alloc_lock); + nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); + mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); if (nio == zio) return (ZIO_PIPELINE_CONTINUE); @@ -2880,13 +2893,13 @@ zio_dva_throttle(zio_t *zio) } void -zio_allocate_dispatch(spa_t *spa) +zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; - mutex_enter(&spa->spa_alloc_lock); - zio = zio_io_to_allocate(spa); - mutex_exit(&spa->spa_alloc_lock); + mutex_enter(&spa->spa_alloc_locks[allocator]); + zio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_alloc_locks[allocator]); if (zio == NULL) return; @@ -2927,7 +2940,7 @@ zio_dva_allocate(zio_t *zio) error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio); + &zio->io_alloc_list, zio, zio->io_allocator); if (error != 0) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " @@ -2987,8 +3000,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t size, boolean_t *slog) +zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; @@ -2996,14 +3009,22 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); + /* + * When allocating a zil block, we don't have information about + * the final destination of the block except the objset it's part + * of, so we just hash the objset ID to pick the allocator to get + * some parallelism. + */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL); + txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, + cityhash4(0, 0, 0, objset) % spa->spa_alloc_count); if (error == 0) { *slog = TRUE; } else { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, - &io_alloc_list, NULL); + &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) % + spa->spa_alloc_count); if (error == 0) *slog = FALSE; } @@ -3493,8 +3514,8 @@ zio_ready(zio_t *zio) */ metaslab_class_throttle_unreserve( spa_normal_class(zio->io_spa), - zio->io_prop.zp_copies, zio); - zio_allocate_dispatch(zio->io_spa); + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } @@ -3577,18 +3598,19 @@ zio_dva_throttle_done(zio_t *zio) ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); mutex_enter(&pio->io_lock); - metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, + pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), - 1, pio); + 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. 
If there is work to be done it will be * dispatched to another taskq thread. */ - zio_allocate_dispatch(zio->io_spa); + zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static int @@ -3631,8 +3653,10 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(bp != NULL); - metaslab_group_alloc_verify(spa, zio->io_bp, zio); - VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); + metaslab_group_alloc_verify(spa, zio->io_bp, zio, + zio->io_allocator); + VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator], + zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++)
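For the allocator-selection policy added in zio_dva_throttle() and zio_alloc_zil() above, the following standalone sketch shows the shape of the hash-and-modulo scheme. It is illustration only: mix64() is a stand-in 64-bit mixer, not the cityhash4() routine the kernel actually uses, and the objset/object numbers are made up. The point it demonstrates is the one from the patch comment: blocks in the same objset/object/level and the same 2^20-block region land on the same allocator, while unrelated streams spread across all spa_alloc_count allocators.

#include <stdint.h>
#include <stdio.h>

static uint64_t
mix64(uint64_t x)	/* stand-in for cityhash4(); illustration only */
{
	x += 0x9e3779b97f4a7c15ULL;
	x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
	x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
	return (x ^ (x >> 31));
}

static int
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid, int alloc_count)
{
	/* Hash the bookmark, chunking each object into 2^20-block regions. */
	uint64_t h = mix64(objset) ^ mix64(object + 1) ^
	    mix64(level + 2) ^ mix64((blkid >> 20) + 3);
	return ((int)(h % alloc_count));
}

int
main(void)
{
	int spa_alloc_count = 4;	/* default spa_allocators */

	/* Blocks 0 and 1000 of one file share a region -> same allocator. */
	printf("blk 0    -> %d\n", pick_allocator(21, 8, 0, 0, spa_alloc_count));
	printf("blk 1000 -> %d\n", pick_allocator(21, 8, 0, 1000, spa_alloc_count));
	/* A different object will usually land on a different allocator. */
	printf("obj 9    -> %d\n", pick_allocator(21, 9, 0, 0, spa_alloc_count));
	return (0);
}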
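Similarly, the per-allocator queue-depth ramp-up documented in the metaslab_group comment, and implemented by metaslab_group_increment_qdepth(), can be pictured with a single-threaded toy model. The kernel's atomic_cas_64() loop is reduced to plain increments here, and a single top-level vdev is assumed so the class budget for the allocator also starts at 32:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t mg_max_alloc_queue_depth = 1000;	/* absolute per-group cap */
	uint64_t cur_max = 32;		/* zfs_vdev_def_queue_depth at txg start */
	uint64_t mc_alloc_max_slots = 32;	/* class budget (one top-level vdev assumed) */

	/* Simulate a burst of completed writes against one metaslab group. */
	for (int completed = 0; completed < 100; completed++) {
		if (cur_max < mg_max_alloc_queue_depth) {
			cur_max++;		/* metaslab_group_increment_qdepth() */
			mc_alloc_max_slots++;
		}
		if (completed % 25 == 24)
			printf("after %3d completions: cur_max=%llu slots=%llu\n",
			    completed + 1, (unsigned long long)cur_max,
			    (unsigned long long)mc_alloc_max_slots);
	}
	return (0);
}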