Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenZFS 9112 - Improve allocation performance on high-end systems #7682

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 10 additions & 8 deletions include/sys/metaslab.h
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_H
Expand Down Expand Up @@ -66,9 +66,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_FASTWRITE 0x20

int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
int);
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
Expand All @@ -91,9 +92,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);

void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
Expand All @@ -102,7 +103,7 @@ uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *);

metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
Expand All @@ -111,8 +112,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);

#ifdef __cplusplus
}
Expand Down
79 changes: 57 additions & 22 deletions include/sys/metaslab_impl.h
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_IMPL_H
Expand Down Expand Up @@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace {
uint64_t mat_weight;
uint32_t mat_dva_id;
uint64_t mat_offset;
int mat_allocator;
} metaslab_alloc_trace_t;

/*
Expand All @@ -72,9 +73,11 @@ typedef enum trace_alloc_type {

#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
#define METASLAB_WEIGHT_TYPE (1ULL << 61)
#define METASLAB_WEIGHT_CLAIM (1ULL << 61)
#define METASLAB_WEIGHT_TYPE (1ULL << 60)
#define METASLAB_ACTIVE_MASK \
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
METASLAB_WEIGHT_CLAIM)

/*
* The metaslab weight is used to encode the amount of free space in a
Expand All @@ -97,37 +100,39 @@ typedef enum trace_alloc_type {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* |PS1| weighted-free space |
* |PSC1| weighted-free space |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
* C - indicates activation for claimed block zio
* space - the fragmentation-weighted space
*
* Segment-based weight:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* |PS0| idx| count of segments in region |
* |PSC0| idx| count of segments in region |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
* C - indicates activation for claimed block zio
* idx - index for the highest bucket in the histogram
* count - number of segments in the specified bucket
*/
#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2)
#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x)
#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3)
#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x)

#define WEIGHT_IS_SPACEBASED(weight) \
((weight) == 0 || BF64_GET((weight), 61, 1))
#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1)
((weight) == 0 || BF64_GET((weight), 60, 1))
#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1)

/*
* These macros are only applicable to segment-based weighting.
*/
#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6)
#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x)
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55)
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x)
#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6)
#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x)
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54)
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x)

/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
Expand Down Expand Up @@ -178,8 +183,8 @@ struct metaslab_class {
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t mc_alloc_max_slots;
refcount_t mc_alloc_slots;
uint64_t *mc_alloc_max_slots;
refcount_t *mc_alloc_slots;

uint64_t mc_alloc_groups; /* # of allocatable groups */

Expand All @@ -201,9 +206,12 @@ struct metaslab_class {
*/
struct metaslab_group {
kmutex_t mg_lock;
metaslab_t **mg_primaries;
metaslab_t **mg_secondaries;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
uint64_t mg_ms_ready;

/*
* A metaslab group is considered to be initialized only after
Expand All @@ -223,15 +231,33 @@ struct metaslab_group {
metaslab_group_t *mg_next;

/*
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
* which are tracked by mg_alloc_queue_depth. It's possible for a
* metaslab group to handle more allocations than its max. This
* can occur when gang blocks are required or when other groups
* are unable to handle their share of allocations.
* In order for the allocation throttle to function properly, we cannot
* have too many IOs going to each disk by default; the throttle
* operates by allocating more work to disks that finish quickly, so
* allocating larger chunks to each disk reduces its effectiveness.
* However, if the number of IOs going to each allocator is too small,
* we will not perform proper aggregation at the vdev_queue layer,
* also resulting in decreased performance. Therefore, we will use a
* ramp-up strategy.
*
* Each allocator in each metaslab group has a current queue depth
* (mg_alloc_queue_depth[allocator]) and a current max queue depth
* (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
* has an absolute max queue depth (mg_max_alloc_queue_depth). We
* add IOs to an allocator until the mg_alloc_queue_depth for that
* allocator hits the cur_max. Every time an IO completes for a given
* allocator on a given metaslab group, we increment its cur_max until
* it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
* help protect against disks that decrease in performance over time.
*
* It's possible for an allocator to handle more allocations than
* its max. This can occur when gang blocks are required or when other
* groups are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
refcount_t mg_alloc_queue_depth;

uint64_t *mg_cur_max_alloc_queue_depth;
refcount_t *mg_alloc_queue_depth;
int mg_allocators;
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
Expand Down Expand Up @@ -355,6 +381,13 @@ struct metaslab {
uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */

/*
* -1 if it's not active in an allocator, otherwise set to the allocator
* this metaslab is active for.
*/
int ms_allocator;
boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */

/*
* The metaslab block allocators can optionally use a size-ordered
* range tree and/or an array of LBAs. Not all allocators use
Expand All @@ -369,6 +402,8 @@ struct metaslab {
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */

boolean_t ms_new;
};

#ifdef __cplusplus
Expand Down
12 changes: 10 additions & 2 deletions include/sys/spa_impl.h
Expand Up @@ -239,8 +239,16 @@ struct spa {
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
kmutex_t spa_alloc_lock;
avl_tree_t spa_alloc_tree;
/*
* spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
* stored in spa_alloc_count. There is one tree and one lock for each
* allocator, to help improve allocation performance in write-heavy
* workloads.
*/
kmutex_t *spa_alloc_locks;
avl_tree_t *spa_alloc_trees;
int spa_alloc_count;

spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
Expand Down
3 changes: 2 additions & 1 deletion include/sys/vdev_impl.h
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_VDEV_IMPL_H
Expand Down Expand Up @@ -60,6 +60,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;

extern int zfs_vdev_queue_depth_pct;
extern int zfs_vdev_def_queue_depth;
extern uint32_t zfs_vdev_async_write_max_active;

/*
Expand Down
3 changes: 2 additions & 1 deletion include/sys/zio.h
Expand Up @@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com>
Expand Down Expand Up @@ -507,6 +507,7 @@ struct zio {
void *io_waiter;
kmutex_t io_lock;
kcondvar_t io_cv;
int io_allocator;

/* FMA state */
zio_cksum_report_t *io_cksum_report;
Expand Down