diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index e386a974a629..ad01cf1a56cd 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1459,7 +1459,8 @@ ZFS_COMMON_OBJS += \
 	zio_compress.o		\
 	zio_inject.o		\
 	zle.o			\
-	zrlock.o
+	zrlock.o		\
+	zthr.o
 
 ZFS_SHARED_OBJS +=		\
 	zfeature_common.o	\
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index e431c1f5bf5b..ac1294f6dfc3 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -274,6 +274,7 @@
 #endif
 #include
 #include
+#include <sys/zthr.h>
 #include
 
 #ifndef _KERNEL
@@ -282,10 +283,11 @@ boolean_t arc_watch = B_FALSE;
 int arc_procfd;
 #endif
 
-static kmutex_t arc_reclaim_lock;
-static kcondvar_t arc_reclaim_thread_cv;
-static boolean_t arc_reclaim_thread_exit;
-static kcondvar_t arc_reclaim_waiters_cv;
+static zthr_t *arc_reap_zthr;
+static zthr_t *arc_adjust_zthr;
+static kmutex_t arc_adjust_lock;
+static kcondvar_t arc_adjust_waiters_cv;
+static boolean_t arc_adjust_needed = B_FALSE;
 
 uint_t arc_reduce_dnlc_percent = 3;
@@ -301,6 +303,9 @@ int zfs_arc_evict_batch_limit = 10;
 /* number of seconds before growing cache again */
 static int arc_grow_retry = 60;
 
+/* number of milliseconds before attempting a kmem-cache-reap */
+static int arc_kmem_cache_reap_retry_ms = 1000;
+
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 int zfs_arc_overflow_shift = 8;
@@ -799,8 +804,10 @@ static arc_state_t *arc_l2c_only;
 #define arc_overhead_size ARCSTAT(arcstat_overhead_size)
 
 static int arc_no_grow;	/* Don't try to grow cache size */
+static hrtime_t arc_growtime;
 static uint64_t arc_tempreserve;
 static uint64_t arc_loaned_bytes;
+static hrtime_t arc_kmem_cache_reap_time;
 
 typedef struct arc_callback arc_callback_t;
@@ -1365,7 +1372,7 @@ hdr_recl(void *unused)
 	 * which is after we do arc_fini().
 	 */
 	if (!arc_dead)
-		cv_signal(&arc_reclaim_thread_cv);
+		zthr_wakeup(arc_reap_zthr);
 }
 
 static void
@@ -3372,13 +3379,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
			 * function should proceed in this case).
			 *
			 * If threads are left sleeping, due to not
-			 * using cv_broadcast, they will be woken up
-			 * just before arc_reclaim_thread() sleeps.
+			 * using cv_broadcast here, they will be woken
+			 * up via cv_broadcast in arc_adjust_cb() just
+			 * before arc_adjust_zthr sleeps.
			 */
-			mutex_enter(&arc_reclaim_lock);
+			mutex_enter(&arc_adjust_lock);
			if (!arc_is_overflowing())
-				cv_signal(&arc_reclaim_waiters_cv);
-			mutex_exit(&arc_reclaim_lock);
+				cv_signal(&arc_adjust_waiters_cv);
+			mutex_exit(&arc_adjust_lock);
		} else {
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
@@ -3848,8 +3856,8 @@ arc_flush(spa_t *spa, boolean_t retry)
 	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 }
 
-void
-arc_shrink(int64_t to_free)
+static void
+arc_reduce_target_size(int64_t to_free)
 {
 	if (arc_c > arc_c_min) {
@@ -3867,8 +3875,13 @@
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
-		(void) arc_adjust();
+	if (arc_size > arc_c) {
+		/* See comment in arc_adjust_cb_check() on why lock+flag */
+		mutex_enter(&arc_adjust_lock);
+		arc_adjust_needed = B_TRUE;
+		mutex_exit(&arc_adjust_lock);
+		zthr_wakeup(arc_adjust_zthr);
+	}
 }
 
 typedef enum free_memory_reason_t {
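The hunk above replaces a direct arc_adjust() call with the pattern this patch uses throughout: publish a work indicator under a lock, drop the lock, then wake the worker zthr. A minimal sketch of that idiom; work_lock, work_pending and request_work() are illustrative names, not part of the patch:

    /*
     * Sketch of the wake-the-worker idiom; all names here are
     * hypothetical.
     */
    static kmutex_t work_lock;
    static boolean_t work_pending;
    static zthr_t *worker_zthr;

    static void
    request_work(void)
    {
        /* publish the indicator first, under the lock... */
        mutex_enter(&work_lock);
        work_pending = B_TRUE;
        mutex_exit(&work_lock);

        /*
         * ...then wake the zthr. A missed wakeup is harmless:
         * the zthr's checkfunc re-reads work_pending before the
         * thread goes back to sleep.
         */
        zthr_wakeup(worker_zthr);
    }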
@@ -4071,127 +4084,160 @@ arc_kmem_reap_now(void)
 	}
 }
 
+/* ARGSUSED */
+static boolean_t
+arc_adjust_cb_check(void *arg, zthr_t *zthr)
+{
+	/*
+	 * This is necessary in order for the mdb ::arc dcmd to
+	 * show up to date information. Since the ::arc command
+	 * does not call the kstat's update function, without
+	 * this call, the command may show stale stats for the
+	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+	 * with this change, the data might be up to 1 second
+	 * out of date (the arc_adjust_zthr has a maximum sleep
+	 * time of 1 second); but that should suffice. The
+	 * arc_state_t structures can be queried directly if more
+	 * accurate information is needed.
+	 */
+	if (arc_ksp != NULL)
+		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+	/*
+	 * We have to rely on arc_get_data_impl() to tell us when to adjust,
+	 * rather than checking if we are overflowing here, so that we are
+	 * sure to not leave arc_get_data_impl() waiting on
+	 * arc_adjust_waiters_cv. If we have become "not overflowing" since
+	 * arc_get_data_impl() checked, we need to wake it up. We could
+	 * broadcast the CV here, but arc_get_data_impl() may not yet have
+	 * gone to sleep. We would need to use a mutex to ensure that this
+	 * function doesn't broadcast until arc_get_data_impl() has gone to
+	 * sleep (e.g. the arc_adjust_lock). However, the lock ordering of
+	 * such a lock would necessarily be incorrect with respect to the
+	 * zthr_lock, which is held before this function is called, and is
+	 * acquired by zthr_wakeup() while arc_get_data_impl() is holding
+	 * the arc_adjust_lock.
+	 */
+	return (arc_adjust_needed);
+}
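The lock-ordering argument in the comment above is easier to follow with the two call chains written out. The following is only an illustration of the ordering the comment describes, not code from the patch:

    /*
     * checkfunc path:
     *   zthr_procedure()            holds zthr_lock
     *     -> arc_adjust_cb_check()  taking arc_adjust_lock here would
     *                               order zthr_lock -> arc_adjust_lock
     *
     * waiter path:
     *   arc_get_data_impl()         holds arc_adjust_lock
     *     -> zthr_wakeup()          takes zthr_lock, i.e. the order
     *                               arc_adjust_lock -> zthr_lock
     *
     * Broadcasting from the checkfunc would require arc_adjust_lock
     * and would therefore invert the lock order; the lock-free read
     * of the arc_adjust_needed flag avoids the problem.
     */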
 
 /*
- * Threads can block in arc_get_data_impl() waiting for this thread to evict
- * enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_impl() are sleeping while holding the hash lock for their
- * particular arc header. Thus, we must be careful to never sleep on a
- * hash lock in this thread. This is to prevent the following deadlock:
- *
- * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
- *   waiting for the reclaim thread to signal it.
- *
- * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
- *   fails, and goes to sleep forever.
- *
- * This possible deadlock is avoided by always acquiring a hash lock
- * using mutex_tryenter() from arc_reclaim_thread().
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
+ * from the ARC.
  */
 /* ARGSUSED */
-static void
-arc_reclaim_thread(void *unused)
+static int
+arc_adjust_cb(void *arg, zthr_t *zthr)
 {
-	hrtime_t growtime = 0;
-	callb_cpr_t cpr;
-
-	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
+	uint64_t evicted = 0;
 
-	mutex_enter(&arc_reclaim_lock);
-	while (!arc_reclaim_thread_exit) {
-		uint64_t evicted = 0;
+	/* Evict from cache */
+	evicted = arc_adjust();
 
+	/*
+	 * If evicted is zero, we couldn't evict anything
+	 * via arc_adjust(). This could be due to hash lock
+	 * collisions, but more likely due to the majority of
+	 * arc buffers being unevictable. Therefore, even if
+	 * arc_size is above arc_c, another pass is unlikely to
+	 * be helpful and could potentially cause us to enter an
+	 * infinite loop. Additionally, zthr_iscancelled() is
+	 * checked here so that if the arc is shutting down, the
+	 * broadcast will wake any remaining arc adjust waiters.
+	 */
+	mutex_enter(&arc_adjust_lock);
+	arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
+	    evicted > 0 && arc_size > arc_c;
+	if (!arc_adjust_needed) {
 		/*
-		 * This is necessary in order for the mdb ::arc dcmd to
-		 * show up to date information. Since the ::arc command
-		 * does not call the kstat's update function, without
-		 * this call, the command may show stale stats for the
-		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
-		 * with this change, the data might be up to 1 second
-		 * out of date; but that should suffice. The arc_state_t
-		 * structures can be queried directly if more accurate
-		 * information is needed.
+		 * We're either no longer overflowing, or we
+		 * can't evict anything more, so we should wake
+		 * up any waiters.
 		 */
-		if (arc_ksp != NULL)
-			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+		cv_broadcast(&arc_adjust_waiters_cv);
+	}
+	mutex_exit(&arc_adjust_lock);
 
-		mutex_exit(&arc_reclaim_lock);
+	return (0);
+}
 
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+	int64_t free_memory = arc_available_memory();
+
+	if (free_memory < 0) {
+		arc_no_grow = B_TRUE;
+		arc_warm = B_TRUE;
 		/*
-		 * We call arc_adjust() before (possibly) calling
-		 * arc_kmem_reap_now(), so that we can wake up
-		 * arc_get_data_impl() sooner.
+		 * Wait at least zfs_grow_retry (default 60) seconds
+		 * before considering growing.
 		 */
-		evicted = arc_adjust();
+		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+		return (B_TRUE);
+	} else if (free_memory < arc_c >> arc_no_grow_shift) {
+		arc_no_grow = B_TRUE;
+	} else if (gethrtime() >= arc_growtime) {
+		arc_no_grow = B_FALSE;
+	}
 
-		int64_t free_memory = arc_available_memory();
-		if (free_memory < 0) {
+	return (B_FALSE);
+}
 
-			arc_no_grow = B_TRUE;
-			arc_warm = B_TRUE;
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_adjust_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static int
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+	int64_t free_memory;
 
-			/*
-			 * Wait at least zfs_grow_retry (default 60) seconds
-			 * before considering growing.
-			 */
-			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+	/*
+	 * Wait at least arc_kmem_cache_reap_retry_ms between
+	 * arc_kmem_reap_now() calls. Without this check it is possible
+	 * to end up in a situation where we spend lots of time reaping
+	 * caches, while we're near arc_c_min.
+	 */
+	if (gethrtime() >= arc_kmem_cache_reap_time) {
+		arc_kmem_cache_reap_time = gethrtime() +
+		    MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
 
-			arc_kmem_reap_now();
+		/*
+		 * We call arc_kmem_reap_now() after determining the
+		 * next value for arc_kmem_cache_reap_time because
+		 * arc_kmem_reap_now() can potentially take minutes to
+		 * return. Thus, we don't want that additional latency
+		 * to impact the value of arc_kmem_cache_reap_time.
+		 */
+		arc_kmem_reap_now();
+	}
 
-			/*
-			 * If we are still low on memory, shrink the ARC
-			 * so that we have arc_shrink_min free space.
-			 */
-			free_memory = arc_available_memory();
+	/*
+	 * Reduce the target size as needed to maintain the
+	 * amount of free memory in the system at a fraction,
+	 * 1/128th by default, of the arc_size. If
+	 * oversubscribed (free_memory < 0) then reduce the
+	 * target arc_size by the deficit amount plus the
+	 * fractional amount. If free memory is positive
+	 * but less than the fractional amount, reduce by
+	 * what is needed to hit the fractional amount.
+	 */
+	free_memory = arc_available_memory();
 
-			int64_t to_free =
-			    (arc_c >> arc_shrink_shift) - free_memory;
-			if (to_free > 0) {
+	int64_t to_free =
+	    (arc_c >> arc_shrink_shift) - free_memory;
+	if (to_free > 0) {
 #ifdef _KERNEL
-				to_free = MAX(to_free, ptob(needfree));
+		to_free = MAX(to_free, ptob(needfree));
 #endif
-				arc_shrink(to_free);
-			}
-		} else if (free_memory < arc_c >> arc_no_grow_shift) {
-			arc_no_grow = B_TRUE;
-		} else if (gethrtime() >= growtime) {
-			arc_no_grow = B_FALSE;
-		}
-
-		mutex_enter(&arc_reclaim_lock);
-
-		/*
-		 * If evicted is zero, we couldn't evict anything via
-		 * arc_adjust(). This could be due to hash lock
-		 * collisions, but more likely due to the majority of
-		 * arc buffers being unevictable. Therefore, even if
-		 * arc_size is above arc_c, another pass is unlikely to
-		 * be helpful and could potentially cause us to enter an
-		 * infinite loop.
-		 */
-		if (arc_size <= arc_c || evicted == 0) {
-			/*
-			 * We're either no longer overflowing, or we
-			 * can't evict anything more, so we should wake
-			 * up any threads before we go to sleep.
-			 */
-			cv_broadcast(&arc_reclaim_waiters_cv);
-
-			/*
-			 * Block until signaled, or after one second (we
-			 * might need to perform arc_kmem_reap_now()
-			 * even if we aren't being signalled)
-			 */
-			CALLB_CPR_SAFE_BEGIN(&cpr);
-			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
-			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
-			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
-		}
+		arc_reduce_target_size(to_free);
 	}
 
-	arc_reclaim_thread_exit = B_FALSE;
-	cv_broadcast(&arc_reclaim_thread_cv);
-	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
-	thread_exit();
+	return (0);
 }
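A worked example of the to_free computation in arc_reap_cb() above, assuming arc_c = 4 GiB and the default arc_shrink_shift of 7, so the fractional amount is arc_c >> 7 = 32 MiB. The numbers are illustrative only:

    /*
     * to_free = (arc_c >> arc_shrink_shift) - free_memory
     *
     *   free_memory = -100 MiB (oversubscribed):
     *       to_free = 32 MiB - (-100 MiB) = 132 MiB  (deficit + fraction)
     *   free_memory = 10 MiB (positive, below the fraction):
     *       to_free = 32 MiB - 10 MiB = 22 MiB       (top up to the fraction)
     *   free_memory = 64 MiB (above the fraction):
     *       to_free = 32 MiB - 64 MiB < 0            (no reduction)
     */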
 
 /*
@@ -4235,11 +4281,15 @@ arc_adapt(int bytes, arc_state_t *state)
 	}
 	ASSERT((int64_t)arc_p >= 0);
 
+	/*
+	 * Wake reap thread if we do not have any available memory
+	 */
 	if (arc_reclaim_needed()) {
-		cv_signal(&arc_reclaim_thread_cv);
+		zthr_wakeup(arc_reap_zthr);
 		return;
 	}
 
+
 	if (arc_no_grow)
 		return;
 
@@ -4333,7 +4383,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 	 * overflowing; thus we don't use a while loop here.
 	 */
 	if (arc_is_overflowing()) {
-		mutex_enter(&arc_reclaim_lock);
+		mutex_enter(&arc_adjust_lock);
 
 		/*
 		 * Now that we've acquired the lock, we may no longer be
@@ -4347,11 +4397,12 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 		 * shouldn't cause any harm.
 		 */
 		if (arc_is_overflowing()) {
-			cv_signal(&arc_reclaim_thread_cv);
-			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+			arc_adjust_needed = B_TRUE;
+			zthr_wakeup(arc_adjust_zthr);
+			(void) cv_wait(&arc_adjust_waiters_cv,
+			    &arc_adjust_lock);
 		}
-
-		mutex_exit(&arc_reclaim_lock);
+		mutex_exit(&arc_adjust_lock);
 	}
 
 	VERIFY3U(hdr->b_type, ==, type);
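The hunk above is the waiter side of the handshake: arc_get_data_impl() publishes arc_adjust_needed, wakes the adjust zthr, and sleeps on arc_adjust_waiters_cv until arc_adjust_cb() broadcasts. A condensed sketch of the pairing; hs_lock, hs_cv, hs_needed, hs_zthr and the helpers do_work() and more_work() are hypothetical:

    static kmutex_t hs_lock;
    static kcondvar_t hs_cv;
    static boolean_t hs_needed;
    static zthr_t *hs_zthr;

    /* checkfunc: runs with the zthr's internal lock held */
    static boolean_t
    hs_check(void *arg, zthr_t *t)
    {
        return (hs_needed);
    }

    /* func: runs with the zthr's internal lock dropped */
    static int
    hs_work(void *arg, zthr_t *t)
    {
        uint64_t done = do_work();              /* hypothetical helper */

        mutex_enter(&hs_lock);
        hs_needed = (done > 0 && more_work());  /* hypothetical helper */
        if (!hs_needed)
            cv_broadcast(&hs_cv);               /* release all waiters */
        mutex_exit(&hs_lock);
        return (0);
    }

    static void
    hs_wait(void)
    {
        mutex_enter(&hs_lock);
        hs_needed = B_TRUE;
        zthr_wakeup(hs_zthr);
        /* woken by hs_work()'s cv_broadcast() */
        (void) cv_wait(&hs_cv, &hs_lock);
        mutex_exit(&hs_lock);
    }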
@@ -5986,10 +6037,8 @@ arc_init(void)
 #else
 	uint64_t allmem = (physmem * PAGESIZE) / 2;
 #endif
-
-	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Convert seconds to clock ticks */
 	arc_min_prefetch_lifespan = 1 * hz;
@@ -6077,7 +6126,6 @@
 	arc_state_init();
 	buf_init();
 
-	arc_reclaim_thread_exit = B_FALSE;
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -6088,8 +6136,10 @@
 		kstat_install(arc_ksp);
 	}
 
-	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
-	    TS_RUN, minclsyspri);
+	arc_adjust_zthr = zthr_create(arc_adjust_cb_check,
+	    arc_adjust_cb, NULL);
+	arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
+	    arc_reap_cb, NULL, SEC2NSEC(1));
 
 	arc_dead = B_FALSE;
 	arc_warm = B_FALSE;
@@ -6113,18 +6163,6 @@
 void
 arc_fini(void)
 {
-	mutex_enter(&arc_reclaim_lock);
-	arc_reclaim_thread_exit = B_TRUE;
-	/*
-	 * The reclaim thread will set arc_reclaim_thread_exit back to
-	 * B_FALSE when it is finished exiting; we're waiting for that.
-	 */
-	while (arc_reclaim_thread_exit) {
-		cv_signal(&arc_reclaim_thread_cv);
-		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
-	}
-	mutex_exit(&arc_reclaim_lock);
-
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
 
@@ -6135,9 +6173,14 @@
 		arc_ksp = NULL;
 	}
 
-	mutex_destroy(&arc_reclaim_lock);
-	cv_destroy(&arc_reclaim_thread_cv);
-	cv_destroy(&arc_reclaim_waiters_cv);
+	(void) zthr_cancel(arc_adjust_zthr);
+	zthr_destroy(arc_adjust_zthr);
+
+	(void) zthr_cancel(arc_reap_zthr);
+	zthr_destroy(arc_reap_zthr);
+
+	mutex_destroy(&arc_adjust_lock);
+	cv_destroy(&arc_adjust_waiters_cv);
 
 	arc_state_fini();
 	buf_fini();
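Note the teardown ordering that arc_fini() now relies on: each zthr is cancelled before it is destroyed, because zthr_destroy() (in zthr.c below) VERIFYs that the thread is already gone. A short sketch of the required sequence:

    /* Sketch of safe zthr teardown, mirroring arc_fini() above. */
    static void
    shutdown_zthr(zthr_t *t)
    {
        (void) zthr_cancel(t);  /* waits until the thread has exited */
        zthr_destroy(t);        /* then frees the lock, CV and zthr_t */
    }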
diff --git a/usr/src/uts/common/fs/zfs/sys/zthr.h b/usr/src/uts/common/fs/zfs/sys/zthr.h
new file mode 100644
index 000000000000..62d47df6f969
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zthr.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZTHR_H
+#define _SYS_ZTHR_H
+
+typedef struct zthr zthr_t;
+typedef int (zthr_func_t)(void *, zthr_t *);
+typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *);
+
+struct zthr {
+	kthread_t *zthr_thread;
+	kmutex_t zthr_lock;
+	kcondvar_t zthr_cv;
+	boolean_t zthr_cancel;
+	hrtime_t zthr_wait_time;
+
+	zthr_checkfunc_t *zthr_checkfunc;
+	zthr_func_t *zthr_func;
+	void *zthr_arg;
+	int zthr_rc;
+};
+
+extern zthr_t *zthr_create(zthr_checkfunc_t *checkfunc,
+    zthr_func_t *func, void *arg);
+extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc,
+    zthr_func_t *func, void *arg, hrtime_t nano_wait);
+
+extern void zthr_exit(zthr_t *t, int rc);
+extern void zthr_destroy(zthr_t *t);
+
+extern void zthr_wakeup(zthr_t *t);
+extern int zthr_cancel(zthr_t *t);
+extern void zthr_resume(zthr_t *t);
+
+extern boolean_t zthr_iscancelled(zthr_t *t);
+extern boolean_t zthr_isrunning(zthr_t *t);
+
+#endif /* _SYS_ZTHR_H */
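A minimal lifecycle sketch for the API declared above. The scrub_* names and the work they pretend to do are hypothetical; the indicator discipline follows the rules documented in zthr.c below. A real user would protect the indicator with a lock, as arc.c does; that is omitted here for brevity:

    static boolean_t scrub_needed;      /* the "work indicator" */

    static boolean_t
    scrub_check(void *arg, zthr_t *t)
    {
        return (scrub_needed);
    }

    static int
    scrub_func(void *arg, zthr_t *t)
    {
        /* scrub_one_chunk() is hypothetical: B_TRUE while work remains */
        while (scrub_needed && !zthr_iscancelled(t))
            scrub_needed = scrub_one_chunk();
        return (0);
    }

    static void
    scrub_example(void)
    {
        zthr_t *t = zthr_create(scrub_check, scrub_func, NULL);

        scrub_needed = B_TRUE;  /* stopped -> running: external thread */
        zthr_wakeup(t);

        (void) zthr_cancel(t);  /* pause around an exclusive operation */
        zthr_resume(t);         /* pick up where it left off */

        (void) zthr_cancel(t);  /* final teardown */
        zthr_destroy(t);
    }

Note that only scrub_func() moves the indicator back to stopped, while only external threads move it to running, matching the correctness rules below.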
diff --git a/usr/src/uts/common/fs/zfs/zthr.c b/usr/src/uts/common/fs/zfs/zthr.c
new file mode 100644
index 000000000000..f26f911cb00e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zthr.c
@@ -0,0 +1,343 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZTHR Infrastructure
+ * ===================
+ *
+ * ZTHR threads are used for isolated operations that span multiple txgs
+ * within a SPA. They generally exist from SPA creation/loading and until
+ * the SPA is exported/destroyed. The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There must be a single point of reference in memory or on disk that
+ *    indicates whether the operation should run/is running or is
+ *    stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr changes the work indicator from stopped
+ *    to running but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to stopped
+ *    (e.g. when it is done) but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ *    running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ *    a new cycle to start.
+ *
+ * Besides being awakened by other threads, a zthr can be configured
+ * during creation to wake up on its own after a specified interval
+ * [see zthr_create_timer()].
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs three inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ *    the zthr should start working or go to sleep. The function should
+ *    return TRUE when the zthr needs to work or FALSE to let it sleep,
+ *    and should adhere to the following signature:
+ *    boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ *    it is not sleeping. The function should adhere to the following
+ *    signature type:
+ *    int func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ *    implicitly by the infrastructure.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal lock (zthr_lock) and the allowed cancellation
+ * windows. We want to hold the zthr_lock while running checkfunc
+ * but not while running func. This way the zthr can be cancelled
+ * while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ *     zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ * or
+ *     zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
+ *         args, max_sleep);
+ *
+ * After that you should be able to wake up, cancel, and resume the
+ * zthr from another thread using zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR cancellation
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
+ *
+ * To cancel a zthr:
+ *     zthr_cancel(zthr_pointer);
+ *
+ * To resume it:
+ *     zthr_resume(zthr_pointer);
+ *
+ * A zthr will implicitly check if it has received a cancellation
+ * signal every time func returns and every time it wakes up [see
+ * ZTHR state transitions below].
+ *
+ * Waiting for the zthr's func to finish its job can take time. This
+ * may be undesirable for operations that need to cancel the SPA's
+ * zthrs quickly (e.g. spa_export). For this scenario the user can
+ * explicitly make their ZTHR function aware of incoming cancellation
+ * signals using zthr_iscancelled(). A common pattern for that looks
+ * like this:
+ *
+ * int
+ * func_name(void *args, zthr_t *t)
+ * {
+ *     ... ...
+ *     while (!work_done && !zthr_iscancelled(t)) {
+ *         ... ...
+ *     }
+ *     return (0);
+ * }
+ *
+ * == ZTHR exit
+ *
+ * For the rare cases where the zthr wants to stop running voluntarily
+ * while running its ZTHR function (func), we provide zthr_exit().
+ * When a zthr has voluntarily stopped running, it can be resumed with
+ * zthr_resume(), just like it would if it was cancelled by some other
+ * thread.
+ *
+ * == ZTHR cleanup
+ *
+ * Cancelling a zthr doesn't clean up its metadata (internal locks,
+ * function pointers to func and checkfunc, etc.). This is because
+ * we want to keep them around in case we want to resume the execution
+ * of the zthr later. Similarly for zthrs that exit themselves.
+ *
+ * To completely clean up a zthr, cancel it first to ensure that it
+ * is not running and then use zthr_destroy().
+ *
+ * == ZTHR state transitions
+ *
+ *           zthr creation
+ *             +
+ *             |
+ *             | woke up
+ *             |   +--------------+ sleep
+ *             |   |     ^
+ *             |   |     |
+ *             |   |     | FALSE
+ *             |   |     |
+ *             v   v     FALSE +
+ *     cancelled? +---------> checkfunc?
+ *         +   ^                 +
+ *         |   |                 |
+ *         |   |                 | TRUE
+ *         |   |                 |
+ *         |   |  func returned  v
+ *         |   +---------------+ func
+ *         |
+ *         | TRUE
+ *         |
+ *         v
+ *     zthr stopped running
+ *
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zthr.h>
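A sketch of the timer variant, mirroring how arc.c above creates arc_reap_zthr with zthr_create_timer(..., SEC2NSEC(1)): with a max_sleep of one second the checkfunc runs at least once per second even if nobody calls zthr_wakeup(). The tick_* names and the memory_pressure()/relieve_pressure() helpers are hypothetical:

    static zthr_t *tick_zthr;

    static boolean_t
    tick_check(void *arg, zthr_t *t)
    {
        return (memory_pressure());     /* hypothetical predicate */
    }

    static int
    tick_func(void *arg, zthr_t *t)
    {
        relieve_pressure();             /* hypothetical work */
        return (0);
    }

    static void
    tick_init(void)
    {
        /* tick_check() is re-evaluated at least every second */
        tick_zthr = zthr_create_timer(tick_check, tick_func,
            NULL, SEC2NSEC(1));
    }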
+
+void
+zthr_exit(zthr_t *t, int rc)
+{
+	ASSERT3P(t->zthr_thread, ==, curthread);
+	mutex_enter(&t->zthr_lock);
+	t->zthr_thread = NULL;
+	t->zthr_rc = rc;
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+	thread_exit();
+}
+
+static void
+zthr_procedure(void *arg)
+{
+	zthr_t *t = arg;
+	int rc = 0;
+
+	mutex_enter(&t->zthr_lock);
+	while (!t->zthr_cancel) {
+		if (t->zthr_checkfunc(t->zthr_arg, t)) {
+			mutex_exit(&t->zthr_lock);
+			rc = t->zthr_func(t->zthr_arg, t);
+			mutex_enter(&t->zthr_lock);
+		} else {
+			/* go to sleep */
+			if (t->zthr_wait_time == 0) {
+				cv_wait(&t->zthr_cv, &t->zthr_lock);
+			} else {
+				(void) cv_timedwait_hires(&t->zthr_cv,
+				    &t->zthr_lock, t->zthr_wait_time,
+				    MSEC2NSEC(1), 0);
+			}
+		}
+	}
+	mutex_exit(&t->zthr_lock);
+
+	zthr_exit(t, rc);
+}
+
+zthr_t *
+zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
+{
+	return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0));
+}
+
+/*
+ * Create a zthr with a specified maximum sleep time. If the time
+ * spent sleeping exceeds max_sleep, a wakeup is triggered (the check
+ * runs and, if required, work starts).
+ */
+zthr_t *
+zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
+    void *arg, hrtime_t max_sleep)
+{
+	zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+	mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+
+	mutex_enter(&t->zthr_lock);
+	t->zthr_checkfunc = checkfunc;
+	t->zthr_func = func;
+	t->zthr_arg = arg;
+	t->zthr_wait_time = max_sleep;
+
+	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+	    0, &p0, TS_RUN, minclsyspri);
+	mutex_exit(&t->zthr_lock);
+
+	return (t);
+}
+
+void
+zthr_destroy(zthr_t *t)
+{
+	VERIFY3P(t->zthr_thread, ==, NULL);
+	mutex_destroy(&t->zthr_lock);
+	cv_destroy(&t->zthr_cv);
+	kmem_free(t, sizeof (*t));
+}
+
+/*
+ * Note: If the zthr is not sleeping and misses the wakeup
+ * (e.g. it is running its ZTHR function), it will check if
+ * there is work to do before going to sleep using its checker
+ * function [see ZTHR state transition in ZTHR block comment].
+ * Thus, missing the wakeup still yields the expected behavior.
+ */
+void
+zthr_wakeup(zthr_t *t)
+{
+	mutex_enter(&t->zthr_lock);
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+}
+
+/*
+ * Note: If the zthr is not running (e.g. has been cancelled
+ * already), this is a no-op.
+ */
+int
+zthr_cancel(zthr_t *t)
+{
+	int rc = 0;
+
+	mutex_enter(&t->zthr_lock);
+
+	/* broadcast in case the zthr is sleeping */
+	cv_broadcast(&t->zthr_cv);
+
+	t->zthr_cancel = B_TRUE;
+	while (t->zthr_thread != NULL)
+		cv_wait(&t->zthr_cv, &t->zthr_lock);
+	t->zthr_cancel = B_FALSE;
+	rc = t->zthr_rc;
+	mutex_exit(&t->zthr_lock);
+
+	return (rc);
+}
+
+void
+zthr_resume(zthr_t *t)
+{
+	ASSERT3P(t->zthr_thread, ==, NULL);
+
+	mutex_enter(&t->zthr_lock);
+
+	ASSERT3P(&t->zthr_checkfunc, !=, NULL);
+	ASSERT3P(&t->zthr_func, !=, NULL);
+	ASSERT(!t->zthr_cancel);
+
+	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+	    0, &p0, TS_RUN, minclsyspri);
+
+	mutex_exit(&t->zthr_lock);
+}
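zthr_exit() above lets a func stop its own thread voluntarily, and zthr_resume() can later restart it. A sketch of the pairing; the drain_* names and the have_items()/drain_one() helpers are hypothetical:

    static int
    drain_func(void *arg, zthr_t *t)
    {
        while (have_items() && !zthr_iscancelled(t))    /* hypothetical */
            drain_one();                                /* hypothetical */
        if (!have_items())
            zthr_exit(t, 0);    /* stops the thread; never returns */
        return (0);
    }

    /* later, from another thread: */
    static void
    drain_restart(zthr_t *t)
    {
        if (!zthr_isrunning(t))
            zthr_resume(t);
    }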
+
+/*
+ * This function is intended to be used by the zthr itself
+ * to check if another thread has signaled it to stop running.
+ *
+ * Returns TRUE if we are in the middle of trying to cancel
+ * this thread.
+ *
+ * Returns FALSE otherwise.
+ */
+boolean_t
+zthr_iscancelled(zthr_t *t)
+{
+	boolean_t cancelled;
+
+	ASSERT3P(t->zthr_thread, ==, curthread);
+
+	mutex_enter(&t->zthr_lock);
+	cancelled = t->zthr_cancel;
+	mutex_exit(&t->zthr_lock);
+
+	return (cancelled);
+}
+
+boolean_t
+zthr_isrunning(zthr_t *t)
+{
+	boolean_t running;
+
+	mutex_enter(&t->zthr_lock);
+	running = (t->zthr_thread != NULL);
+	mutex_exit(&t->zthr_lock);
+
+	return (running);
+}
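One last detail worth illustrating: zthr_cancel() above returns the rc most recently recorded by zthr_exit(), i.e. the last return value of func (or the rc a func passed to zthr_exit() itself), so a caller can observe how the worker stopped. A sketch, assuming a zthr whose func reports errors through its return value:

    static void
    stop_and_check(zthr_t *t)
    {
        int err = zthr_cancel(t);   /* last rc from func/zthr_exit() */

        if (err != 0)
            cmn_err(CE_WARN, "zthr stopped with error %d", err);
        zthr_destroy(t);
    }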