diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index e386a974a629..ad01cf1a56cd 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1459,7 +1459,8 @@ ZFS_COMMON_OBJS += \
 	zio_compress.o		\
 	zio_inject.o		\
 	zle.o			\
-	zrlock.o
+	zrlock.o		\
+	zthr.o
 
 ZFS_SHARED_OBJS +=		\
 	zfeature_common.o	\
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index e431c1f5bf5b..ac1294f6dfc3 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -274,6 +274,7 @@
 #endif
 #include
 #include
+#include <sys/zthr.h>
 #include
 
 #ifndef _KERNEL
@@ -282,10 +283,11 @@ boolean_t arc_watch = B_FALSE;
 int arc_procfd;
 #endif
 
-static kmutex_t arc_reclaim_lock;
-static kcondvar_t arc_reclaim_thread_cv;
-static boolean_t arc_reclaim_thread_exit;
-static kcondvar_t arc_reclaim_waiters_cv;
+static zthr_t *arc_reap_zthr;
+static zthr_t *arc_adjust_zthr;
+static kmutex_t arc_adjust_lock;
+static kcondvar_t arc_adjust_waiters_cv;
+static boolean_t arc_adjust_needed = B_FALSE;
 
 uint_t arc_reduce_dnlc_percent = 3;
@@ -301,6 +303,9 @@ int zfs_arc_evict_batch_limit = 10;
 /* number of seconds before growing cache again */
 static int arc_grow_retry = 60;
 
+/* number of milliseconds before attempting a kmem-cache-reap */
+static int arc_kmem_cache_reap_retry_ms = 1000;
+
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 int zfs_arc_overflow_shift = 8;
@@ -799,8 +804,10 @@ static arc_state_t *arc_l2c_only;
 #define arc_overhead_size ARCSTAT(arcstat_overhead_size)
 
 static int arc_no_grow;	/* Don't try to grow cache size */
+static hrtime_t arc_growtime;
 static uint64_t arc_tempreserve;
 static uint64_t arc_loaned_bytes;
+static hrtime_t arc_kmem_cache_reap_time;
 
 typedef struct arc_callback arc_callback_t;
@@ -1365,7 +1372,7 @@ hdr_recl(void *unused)
 	 * which is after we do arc_fini().
 	 */
 	if (!arc_dead)
-		cv_signal(&arc_reclaim_thread_cv);
+		zthr_wakeup(arc_reap_zthr);
 }
 
 static void
@@ -3372,13 +3379,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
			 * function should proceed in this case).
			 *
			 * If threads are left sleeping, due to not
-			 * using cv_broadcast, they will be woken up
-			 * just before arc_reclaim_thread() sleeps.
+			 * using cv_broadcast here, they will be woken
+			 * up via cv_broadcast in arc_adjust_cb() just
+			 * before arc_adjust_zthr sleeps.
			 */
-			mutex_enter(&arc_reclaim_lock);
+			mutex_enter(&arc_adjust_lock);
			if (!arc_is_overflowing())
-				cv_signal(&arc_reclaim_waiters_cv);
-			mutex_exit(&arc_reclaim_lock);
+				cv_signal(&arc_adjust_waiters_cv);
+			mutex_exit(&arc_adjust_lock);
		} else {
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
@@ -3848,8 +3856,8 @@ arc_flush(spa_t *spa, boolean_t retry)
 	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 }
 
-void
-arc_shrink(int64_t to_free)
+static void
+arc_reduce_target_size(int64_t to_free)
 {
 	if (arc_c > arc_c_min) {
@@ -3867,8 +3875,13 @@
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
-		(void) arc_adjust();
+	if (arc_size > arc_c) {
+		/* See comment in arc_adjust_cb_check() on why lock+flag */
+		mutex_enter(&arc_adjust_lock);
+		arc_adjust_needed = B_TRUE;
+		mutex_exit(&arc_adjust_lock);
+		zthr_wakeup(arc_adjust_zthr);
+	}
 }
 
 typedef enum free_memory_reason_t {
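The hunk above replaces a direct arc_adjust() call with the pattern this patch uses throughout: publish a work indicator under a lock, drop the lock, then wake the worker zthr. A minimal sketch of that idiom; work_lock, work_pending and request_work() are illustrative names, not part of the patch:

    /*
     * Sketch of the wake-the-worker idiom; all names here are
     * hypothetical.
     */
    static kmutex_t work_lock;
    static boolean_t work_pending;
    static zthr_t *worker_zthr;

    static void
    request_work(void)
    {
        /* publish the indicator first, under the lock... */
        mutex_enter(&work_lock);
        work_pending = B_TRUE;
        mutex_exit(&work_lock);

        /*
         * ...then wake the zthr. A missed wakeup is harmless:
         * the zthr's checkfunc re-reads work_pending before the
         * thread goes back to sleep.
         */
        zthr_wakeup(worker_zthr);
    }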
@@ -4071,127 +4084,160 @@ arc_kmem_reap_now(void)
 	}
 }
 
+/* ARGSUSED */
+static boolean_t
+arc_adjust_cb_check(void *arg, zthr_t *zthr)
+{
+	/*
+	 * This is necessary in order for the mdb ::arc dcmd to
+	 * show up to date information. Since the ::arc command
+	 * does not call the kstat's update function, without
+	 * this call, the command may show stale stats for the
+	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+	 * with this change, the data might be up to 1 second
+	 * out of date (the arc_adjust_zthr has a maximum sleep
+	 * time of 1 second); but that should suffice. The
+	 * arc_state_t structures can be queried directly if more
+	 * accurate information is needed.
+	 */
+	if (arc_ksp != NULL)
+		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+	/*
+	 * We have to rely on arc_get_data_impl() to tell us when to adjust,
+	 * rather than checking if we are overflowing here, so that we are
+	 * sure to not leave arc_get_data_impl() waiting on
+	 * arc_adjust_waiters_cv. If we have become "not overflowing" since
+	 * arc_get_data_impl() checked, we need to wake it up. We could
+	 * broadcast the CV here, but arc_get_data_impl() may not yet have
+	 * gone to sleep. We would need to use a mutex to ensure that this
+	 * function doesn't broadcast until arc_get_data_impl() has gone to
+	 * sleep (e.g. the arc_adjust_lock). However, the lock ordering of
+	 * such a lock would necessarily be incorrect with respect to the
+	 * zthr_lock, which is held before this function is called, and is
+	 * acquired by zthr_wakeup() while arc_get_data_impl() is holding
+	 * the arc_adjust_lock.
+	 */
+	return (arc_adjust_needed);
+}
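The lock-ordering argument in the comment above is easier to follow with the two call chains written out. The following is only an illustration of the ordering the comment describes, not code from the patch:

    /*
     * checkfunc path:
     *   zthr_procedure()            holds zthr_lock
     *     -> arc_adjust_cb_check()  taking arc_adjust_lock here would
     *                               order zthr_lock -> arc_adjust_lock
     *
     * waiter path:
     *   arc_get_data_impl()         holds arc_adjust_lock
     *     -> zthr_wakeup()          takes zthr_lock, i.e. the order
     *                               arc_adjust_lock -> zthr_lock
     *
     * Broadcasting from the checkfunc would require arc_adjust_lock
     * and would therefore invert the lock order; the lock-free read
     * of the arc_adjust_needed flag avoids the problem.
     */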
 
 /*
- * Threads can block in arc_get_data_impl() waiting for this thread to evict
- * enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_impl() are sleeping while holding the hash lock for their
- * particular arc header. Thus, we must be careful to never sleep on a
- * hash lock in this thread. This is to prevent the following deadlock:
- *
- * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
- *   waiting for the reclaim thread to signal it.
- *
- * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
- *   fails, and goes to sleep forever.
- *
- * This possible deadlock is avoided by always acquiring a hash lock
- * using mutex_tryenter() from arc_reclaim_thread().
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
+ * from the ARC.
  */
 /* ARGSUSED */
-static void
-arc_reclaim_thread(void *unused)
+static int
+arc_adjust_cb(void *arg, zthr_t *zthr)
 {
-	hrtime_t growtime = 0;
-	callb_cpr_t cpr;
-
-	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
+	uint64_t evicted = 0;
 
-	mutex_enter(&arc_reclaim_lock);
-	while (!arc_reclaim_thread_exit) {
-		uint64_t evicted = 0;
+	/* Evict from cache */
+	evicted = arc_adjust();
 
+	/*
+	 * If evicted is zero, we couldn't evict anything
+	 * via arc_adjust(). This could be due to hash lock
+	 * collisions, but more likely due to the majority of
+	 * arc buffers being unevictable. Therefore, even if
+	 * arc_size is above arc_c, another pass is unlikely to
+	 * be helpful and could potentially cause us to enter an
+	 * infinite loop. Additionally, zthr_iscancelled() is
+	 * checked here so that if the arc is shutting down, the
+	 * broadcast will wake any remaining arc adjust waiters.
+	 */
+	mutex_enter(&arc_adjust_lock);
+	arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
+	    evicted > 0 && arc_size > arc_c;
+	if (!arc_adjust_needed) {
 		/*
-		 * This is necessary in order for the mdb ::arc dcmd to
-		 * show up to date information. Since the ::arc command
-		 * does not call the kstat's update function, without
-		 * this call, the command may show stale stats for the
-		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
-		 * with this change, the data might be up to 1 second
-		 * out of date; but that should suffice. The arc_state_t
-		 * structures can be queried directly if more accurate
-		 * information is needed.
+		 * We're either no longer overflowing, or we
+		 * can't evict anything more, so we should wake
+		 * up any waiters.
 		 */
-		if (arc_ksp != NULL)
-			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+		cv_broadcast(&arc_adjust_waiters_cv);
+	}
+	mutex_exit(&arc_adjust_lock);
 
-		mutex_exit(&arc_reclaim_lock);
+	return (0);
+}
 
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+	int64_t free_memory = arc_available_memory();
+
+	if (free_memory < 0) {
+		arc_no_grow = B_TRUE;
+		arc_warm = B_TRUE;
 		/*
-		 * We call arc_adjust() before (possibly) calling
-		 * arc_kmem_reap_now(), so that we can wake up
-		 * arc_get_data_impl() sooner.
+		 * Wait at least zfs_grow_retry (default 60) seconds
+		 * before considering growing.
 		 */
-		evicted = arc_adjust();
+		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+		return (B_TRUE);
+	} else if (free_memory < arc_c >> arc_no_grow_shift) {
+		arc_no_grow = B_TRUE;
+	} else if (gethrtime() >= arc_growtime) {
+		arc_no_grow = B_FALSE;
+	}
 
-		int64_t free_memory = arc_available_memory();
-		if (free_memory < 0) {
+	return (B_FALSE);
+}
 
-			arc_no_grow = B_TRUE;
-			arc_warm = B_TRUE;
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_adjust_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static int
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+	int64_t free_memory;
 
-			/*
-			 * Wait at least zfs_grow_retry (default 60) seconds
-			 * before considering growing.
-			 */
-			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+	/*
+	 * Wait at least arc_kmem_cache_reap_retry_ms between
+	 * arc_kmem_reap_now() calls. Without this check it is possible
+	 * to end up in a situation where we spend lots of time reaping
+	 * caches, while we're near arc_c_min.
+	 */
+	if (gethrtime() >= arc_kmem_cache_reap_time) {
+		arc_kmem_cache_reap_time = gethrtime() +
+		    MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
 
-			arc_kmem_reap_now();
+		/*
+		 * We call arc_kmem_reap_now() after determining the
+		 * next value for arc_kmem_cache_reap_time because
+		 * arc_kmem_reap_now() can potentially take minutes to
+		 * return. Thus, we don't want that additional latency
+		 * to impact the value of arc_kmem_cache_reap_time.
+		 */
+		arc_kmem_reap_now();
+	}
 
-			/*
-			 * If we are still low on memory, shrink the ARC
-			 * so that we have arc_shrink_min free space.
-			 */
-			free_memory = arc_available_memory();
+	/*
+	 * Reduce the target size as needed to maintain the
+	 * amount of free memory in the system at a fraction,
+	 * 1/128th by default, of the arc_size. If
+	 * oversubscribed (free_memory < 0) then reduce the
+	 * target arc_size by the deficit amount plus the
+	 * fractional amount. If free memory is positive
+	 * but less than the fractional amount, reduce by
+	 * what is needed to hit the fractional amount.
+	 */
+	free_memory = arc_available_memory();
 
-			int64_t to_free =
-			    (arc_c >> arc_shrink_shift) - free_memory;
-			if (to_free > 0) {
+	int64_t to_free =
+	    (arc_c >> arc_shrink_shift) - free_memory;
+	if (to_free > 0) {
 #ifdef _KERNEL
-				to_free = MAX(to_free, ptob(needfree));
+		to_free = MAX(to_free, ptob(needfree));
 #endif
-				arc_shrink(to_free);
-			}
-		} else if (free_memory < arc_c >> arc_no_grow_shift) {
-			arc_no_grow = B_TRUE;
-		} else if (gethrtime() >= growtime) {
-			arc_no_grow = B_FALSE;
-		}
-
-		mutex_enter(&arc_reclaim_lock);
-
-		/*
-		 * If evicted is zero, we couldn't evict anything via
-		 * arc_adjust(). This could be due to hash lock
-		 * collisions, but more likely due to the majority of
-		 * arc buffers being unevictable. Therefore, even if
-		 * arc_size is above arc_c, another pass is unlikely to
-		 * be helpful and could potentially cause us to enter an
-		 * infinite loop.
-		 */
-		if (arc_size <= arc_c || evicted == 0) {
-			/*
-			 * We're either no longer overflowing, or we
-			 * can't evict anything more, so we should wake
-			 * up any threads before we go to sleep.
-			 */
-			cv_broadcast(&arc_reclaim_waiters_cv);
-
-			/*
-			 * Block until signaled, or after one second (we
-			 * might need to perform arc_kmem_reap_now()
-			 * even if we aren't being signalled)
-			 */
-			CALLB_CPR_SAFE_BEGIN(&cpr);
-			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
-			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
-			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
-		}
+		arc_reduce_target_size(to_free);
 	}
 
-	arc_reclaim_thread_exit = B_FALSE;
-	cv_broadcast(&arc_reclaim_thread_cv);
-	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
-	thread_exit();
+	return (0);
 }
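A worked example of the to_free computation in arc_reap_cb() above, assuming arc_c = 4 GiB and the default arc_shrink_shift of 7, so the fractional amount is arc_c >> 7 = 32 MiB. The numbers are illustrative only:

    /*
     * to_free = (arc_c >> arc_shrink_shift) - free_memory
     *
     *   free_memory = -100 MiB (oversubscribed):
     *       to_free = 32 MiB - (-100 MiB) = 132 MiB  (deficit + fraction)
     *   free_memory = 10 MiB (positive, below the fraction):
     *       to_free = 32 MiB - 10 MiB = 22 MiB       (top up to the fraction)
     *   free_memory = 64 MiB (above the fraction):
     *       to_free = 32 MiB - 64 MiB < 0            (no reduction)
     */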
 
 /*
@@ -4235,11 +4281,15 @@ arc_adapt(int bytes, arc_state_t *state)
 	}
 	ASSERT((int64_t)arc_p >= 0);
 
+	/*
+	 * Wake reap thread if we do not have any available memory
+	 */
 	if (arc_reclaim_needed()) {
-		cv_signal(&arc_reclaim_thread_cv);
+		zthr_wakeup(arc_reap_zthr);
 		return;
 	}
 
+
 	if (arc_no_grow)
 		return;
 
@@ -4333,7 +4383,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 	 * overflowing; thus we don't use a while loop here.
 	 */
 	if (arc_is_overflowing()) {
-		mutex_enter(&arc_reclaim_lock);
+		mutex_enter(&arc_adjust_lock);
 
 		/*
 		 * Now that we've acquired the lock, we may no longer be
@@ -4347,11 +4397,12 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 		 * shouldn't cause any harm.
 		 */
 		if (arc_is_overflowing()) {
-			cv_signal(&arc_reclaim_thread_cv);
-			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+			arc_adjust_needed = B_TRUE;
+			zthr_wakeup(arc_adjust_zthr);
+			(void) cv_wait(&arc_adjust_waiters_cv,
+			    &arc_adjust_lock);
 		}
-
-		mutex_exit(&arc_reclaim_lock);
+		mutex_exit(&arc_adjust_lock);
 	}
 
 	VERIFY3U(hdr->b_type, ==, type);
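The hunk above is the waiter side of the handshake: arc_get_data_impl() publishes arc_adjust_needed, wakes the adjust zthr, and sleeps on arc_adjust_waiters_cv until arc_adjust_cb() broadcasts. A condensed sketch of the pairing; hs_lock, hs_cv, hs_needed, hs_zthr and the helpers do_work() and more_work() are hypothetical:

    static kmutex_t hs_lock;
    static kcondvar_t hs_cv;
    static boolean_t hs_needed;
    static zthr_t *hs_zthr;

    /* checkfunc: runs with the zthr's internal lock held */
    static boolean_t
    hs_check(void *arg, zthr_t *t)
    {
        return (hs_needed);
    }

    /* func: runs with the zthr's internal lock dropped */
    static int
    hs_work(void *arg, zthr_t *t)
    {
        uint64_t done = do_work();              /* hypothetical helper */

        mutex_enter(&hs_lock);
        hs_needed = (done > 0 && more_work());  /* hypothetical helper */
        if (!hs_needed)
            cv_broadcast(&hs_cv);               /* release all waiters */
        mutex_exit(&hs_lock);
        return (0);
    }

    static void
    hs_wait(void)
    {
        mutex_enter(&hs_lock);
        hs_needed = B_TRUE;
        zthr_wakeup(hs_zthr);
        /* woken by hs_work()'s cv_broadcast() */
        (void) cv_wait(&hs_cv, &hs_lock);
        mutex_exit(&hs_lock);
    }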
@@ -5986,10 +6037,8 @@ arc_init(void)
 #else
 	uint64_t allmem = (physmem * PAGESIZE) / 2;
 #endif
-
-	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Convert seconds to clock ticks */
 	arc_min_prefetch_lifespan = 1 * hz;
@@ -6077,7 +6126,6 @@
 	arc_state_init();
 	buf_init();
 
-	arc_reclaim_thread_exit = B_FALSE;
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -6088,8 +6136,10 @@
 		kstat_install(arc_ksp);
 	}
 
-	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
-	    TS_RUN, minclsyspri);
+	arc_adjust_zthr = zthr_create(arc_adjust_cb_check,
+	    arc_adjust_cb, NULL);
+	arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
+	    arc_reap_cb, NULL, SEC2NSEC(1));
 
 	arc_dead = B_FALSE;
 	arc_warm = B_FALSE;
@@ -6113,18 +6163,6 @@
 void
 arc_fini(void)
 {
-	mutex_enter(&arc_reclaim_lock);
-	arc_reclaim_thread_exit = B_TRUE;
-	/*
-	 * The reclaim thread will set arc_reclaim_thread_exit back to
-	 * B_FALSE when it is finished exiting; we're waiting for that.
-	 */
-	while (arc_reclaim_thread_exit) {
-		cv_signal(&arc_reclaim_thread_cv);
-		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
-	}
-	mutex_exit(&arc_reclaim_lock);
-
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
 
@@ -6135,9 +6173,14 @@
 		arc_ksp = NULL;
 	}
 
-	mutex_destroy(&arc_reclaim_lock);
-	cv_destroy(&arc_reclaim_thread_cv);
-	cv_destroy(&arc_reclaim_waiters_cv);
+	(void) zthr_cancel(arc_adjust_zthr);
+	zthr_destroy(arc_adjust_zthr);
+
+	(void) zthr_cancel(arc_reap_zthr);
+	zthr_destroy(arc_reap_zthr);
+
+	mutex_destroy(&arc_adjust_lock);
+	cv_destroy(&arc_adjust_waiters_cv);
 
 	arc_state_fini();
 	buf_fini();
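Note the teardown ordering that arc_fini() now relies on: each zthr is cancelled before it is destroyed, because zthr_destroy() (in zthr.c below) VERIFYs that the thread is already gone. A short sketch of the required sequence:

    /* Sketch of safe zthr teardown, mirroring arc_fini() above. */
    static void
    shutdown_zthr(zthr_t *t)
    {
        (void) zthr_cancel(t);  /* waits until the thread has exited */
        zthr_destroy(t);        /* then frees the lock, CV and zthr_t */
    }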
diff --git a/usr/src/uts/common/fs/zfs/sys/zthr.h b/usr/src/uts/common/fs/zfs/sys/zthr.h
new file mode 100644
index 000000000000..62d47df6f969
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zthr.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZTHR_H
+#define _SYS_ZTHR_H
+
+typedef struct zthr zthr_t;
+typedef int (zthr_func_t)(void *, zthr_t *);
+typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *);
+
+struct zthr {
+	kthread_t *zthr_thread;
+	kmutex_t zthr_lock;
+	kcondvar_t zthr_cv;
+	boolean_t zthr_cancel;
+	hrtime_t zthr_wait_time;
+
+	zthr_checkfunc_t *zthr_checkfunc;
+	zthr_func_t *zthr_func;
+	void *zthr_arg;
+	int zthr_rc;
+};
+
+extern zthr_t *zthr_create(zthr_checkfunc_t *checkfunc,
+    zthr_func_t *func, void *arg);
+extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc,
+    zthr_func_t *func, void *arg, hrtime_t nano_wait);
+
+extern void zthr_exit(zthr_t *t, int rc);
+extern void zthr_destroy(zthr_t *t);
+
+extern void zthr_wakeup(zthr_t *t);
+extern int zthr_cancel(zthr_t *t);
+extern void zthr_resume(zthr_t *t);
+
+extern boolean_t zthr_iscancelled(zthr_t *t);
+extern boolean_t zthr_isrunning(zthr_t *t);
+
+#endif /* _SYS_ZTHR_H */
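A minimal lifecycle sketch for the API declared above. The scrub_* names and the work they pretend to do are hypothetical; the indicator discipline follows the rules documented in zthr.c below. A real user would protect the indicator with a lock, as arc.c does; that is omitted here for brevity:

    static boolean_t scrub_needed;      /* the "work indicator" */

    static boolean_t
    scrub_check(void *arg, zthr_t *t)
    {
        return (scrub_needed);
    }

    static int
    scrub_func(void *arg, zthr_t *t)
    {
        /* scrub_one_chunk() is hypothetical: B_TRUE while work remains */
        while (scrub_needed && !zthr_iscancelled(t))
            scrub_needed = scrub_one_chunk();
        return (0);
    }

    static void
    scrub_example(void)
    {
        zthr_t *t = zthr_create(scrub_check, scrub_func, NULL);

        scrub_needed = B_TRUE;  /* stopped -> running: external thread */
        zthr_wakeup(t);

        (void) zthr_cancel(t);  /* pause around an exclusive operation */
        zthr_resume(t);         /* pick up where it left off */

        (void) zthr_cancel(t);  /* final teardown */
        zthr_destroy(t);
    }

Note that only scrub_func() moves the indicator back to stopped, while only external threads move it to running, matching the correctness rules below.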
diff --git a/usr/src/uts/common/fs/zfs/zthr.c b/usr/src/uts/common/fs/zfs/zthr.c
new file mode 100644
index 000000000000..f26f911cb00e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zthr.c
@@ -0,0 +1,343 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZTHR Infrastructure
+ * ===================
+ *
+ * ZTHR threads are used for isolated operations that span multiple txgs
+ * within a SPA. They generally exist from SPA creation/loading and until
+ * the SPA is exported/destroyed. The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There must be a single point of reference in memory or on disk that
+ *    indicates whether the operation should run/is running or is
+ *    stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr changes the work indicator from stopped
+ *    to running but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to stopped
+ *    (e.g. when it is done) but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ *    running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ *    a new cycle to start.
+ *
+ * Besides being awakened by other threads, a zthr can be configured
+ * during creation to wake up on its own after a specified interval
+ * [see zthr_create_timer()].
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs three inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ *    the zthr should start working or go to sleep. The function should
+ *    return TRUE when the zthr needs to work or FALSE to let it sleep,
+ *    and should adhere to the following signature:
+ *    boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ *    it is not sleeping. The function should adhere to the following
+ *    signature type:
+ *    int func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ *    implicitly by the infrastructure.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal lock (zthr_lock) and the allowed cancellation
+ * windows. We want to hold the zthr_lock while running checkfunc
+ * but not while running func. This way the zthr can be cancelled
+ * while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ *     zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ * or
+ *     zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
+ *         args, max_sleep);
+ *
+ * After that you should be able to wake up, cancel, and resume the
+ * zthr from another thread using zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR cancellation
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
+ *
+ * To cancel a zthr:
+ *     zthr_cancel(zthr_pointer);
+ *
+ * To resume it:
+ *     zthr_resume(zthr_pointer);
+ *
+ * A zthr will implicitly check if it has received a cancellation
+ * signal every time func returns and every time it wakes up [see
+ * ZTHR state transitions below].
+ *
+ * Waiting for the zthr's func to finish its job can take time. This
+ * may be undesirable for operations that need to cancel the SPA's
+ * zthrs quickly (e.g. spa_export). For this scenario the user can
+ * explicitly make their ZTHR function aware of incoming cancellation
+ * signals using zthr_iscancelled(). A common pattern for that looks
+ * like this:
+ *
+ * int
+ * func_name(void *args, zthr_t *t)
+ * {
+ *     ... ...
+ *     while (!work_done && !zthr_iscancelled(t)) {
+ *         ... ...
+ *     }
+ *     return (0);
+ * }
+ *
+ * == ZTHR exit
+ *
+ * For the rare cases where the zthr wants to stop running voluntarily
+ * while running its ZTHR function (func), we provide zthr_exit().
+ * When a zthr has voluntarily stopped running, it can be resumed with
+ * zthr_resume(), just like it would if it was cancelled by some other
+ * thread.
+ *
+ * == ZTHR cleanup
+ *
+ * Cancelling a zthr doesn't clean up its metadata (internal locks,
+ * function pointers to func and checkfunc, etc.). This is because
+ * we want to keep them around in case we want to resume the execution
+ * of the zthr later. Similarly for zthrs that exit themselves.
+ *
+ * To completely clean up a zthr, cancel it first to ensure that it
+ * is not running and then use zthr_destroy().
+ *
+ * == ZTHR state transitions
+ *
+ *           zthr creation
+ *             +
+ *             |
+ *             | woke up
+ *             |   +--------------+ sleep
+ *             |   |     ^
+ *             |   |     |
+ *             |   |     | FALSE
+ *             |   |     |
+ *             v   v     FALSE +
+ *     cancelled? +---------> checkfunc?
+ *         +   ^                 +
+ *         |   |                 |
+ *         |   |                 | TRUE
+ *         |   |                 |
+ *         |   |  func returned  v
+ *         |   +---------------+ func
+ *         |
+ *         | TRUE
+ *         |
+ *         v
+ *     zthr stopped running
+ *
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zthr.h>
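A sketch of the timer variant, mirroring how arc.c above creates arc_reap_zthr with zthr_create_timer(..., SEC2NSEC(1)): with a max_sleep of one second the checkfunc runs at least once per second even if nobody calls zthr_wakeup(). The tick_* names and the memory_pressure()/relieve_pressure() helpers are hypothetical:

    static zthr_t *tick_zthr;

    static boolean_t
    tick_check(void *arg, zthr_t *t)
    {
        return (memory_pressure());     /* hypothetical predicate */
    }

    static int
    tick_func(void *arg, zthr_t *t)
    {
        relieve_pressure();             /* hypothetical work */
        return (0);
    }

    static void
    tick_init(void)
    {
        /* tick_check() is re-evaluated at least every second */
        tick_zthr = zthr_create_timer(tick_check, tick_func,
            NULL, SEC2NSEC(1));
    }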
+
+void
+zthr_exit(zthr_t *t, int rc)
+{
+	ASSERT3P(t->zthr_thread, ==, curthread);
+	mutex_enter(&t->zthr_lock);
+	t->zthr_thread = NULL;
+	t->zthr_rc = rc;
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+	thread_exit();
+}
+
+static void
+zthr_procedure(void *arg)
+{
+	zthr_t *t = arg;
+	int rc = 0;
+
+	mutex_enter(&t->zthr_lock);
+	while (!t->zthr_cancel) {
+		if (t->zthr_checkfunc(t->zthr_arg, t)) {
+			mutex_exit(&t->zthr_lock);
+			rc = t->zthr_func(t->zthr_arg, t);
+			mutex_enter(&t->zthr_lock);
+		} else {
+			/* go to sleep */
+			if (t->zthr_wait_time == 0) {
+				cv_wait(&t->zthr_cv, &t->zthr_lock);
+			} else {
+				(void) cv_timedwait_hires(&t->zthr_cv,
+				    &t->zthr_lock, t->zthr_wait_time,
+				    MSEC2NSEC(1), 0);
+			}
+		}
+	}
+	mutex_exit(&t->zthr_lock);
+
+	zthr_exit(t, rc);
+}
+
+zthr_t *
+zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
+{
+	return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0));
+}
+
+/*
+ * Create a zthr with a specified maximum sleep time. If the time
+ * spent sleeping exceeds max_sleep, a wakeup is triggered (the check
+ * runs and, if required, work starts).
+ */
+zthr_t *
+zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
+    void *arg, hrtime_t max_sleep)
+{
+	zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+	mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+
+	mutex_enter(&t->zthr_lock);
+	t->zthr_checkfunc = checkfunc;
+	t->zthr_func = func;
+	t->zthr_arg = arg;
+	t->zthr_wait_time = max_sleep;
+
+	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+	    0, &p0, TS_RUN, minclsyspri);
+	mutex_exit(&t->zthr_lock);
+
+	return (t);
+}
+
+void
+zthr_destroy(zthr_t *t)
+{
+	VERIFY3P(t->zthr_thread, ==, NULL);
+	mutex_destroy(&t->zthr_lock);
+	cv_destroy(&t->zthr_cv);
+	kmem_free(t, sizeof (*t));
+}
+
+/*
+ * Note: If the zthr is not sleeping and misses the wakeup
+ * (e.g. it is running its ZTHR function), it will check if
+ * there is work to do before going to sleep using its checker
+ * function [see ZTHR state transition in ZTHR block comment].
+ * Thus, missing the wakeup still yields the expected behavior.
+ */
+void
+zthr_wakeup(zthr_t *t)
+{
+	mutex_enter(&t->zthr_lock);
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+}
+
+/*
+ * Note: If the zthr is not running (e.g. has been cancelled
+ * already), this is a no-op.
+ */
+int
+zthr_cancel(zthr_t *t)
+{
+	int rc = 0;
+
+	mutex_enter(&t->zthr_lock);
+
+	/* broadcast in case the zthr is sleeping */
+	cv_broadcast(&t->zthr_cv);
+
+	t->zthr_cancel = B_TRUE;
+	while (t->zthr_thread != NULL)
+		cv_wait(&t->zthr_cv, &t->zthr_lock);
+	t->zthr_cancel = B_FALSE;
+	rc = t->zthr_rc;
+	mutex_exit(&t->zthr_lock);
+
+	return (rc);
+}
+
+void
+zthr_resume(zthr_t *t)
+{
+	ASSERT3P(t->zthr_thread, ==, NULL);
+
+	mutex_enter(&t->zthr_lock);
+
+	ASSERT3P(&t->zthr_checkfunc, !=, NULL);
+	ASSERT3P(&t->zthr_func, !=, NULL);
+	ASSERT(!t->zthr_cancel);
+
+	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+	    0, &p0, TS_RUN, minclsyspri);
+
+	mutex_exit(&t->zthr_lock);
+}
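zthr_exit() above lets a func stop its own thread voluntarily, and zthr_resume() can later restart it. A sketch of the pairing; the drain_* names and the have_items()/drain_one() helpers are hypothetical:

    static int
    drain_func(void *arg, zthr_t *t)
    {
        while (have_items() && !zthr_iscancelled(t))    /* hypothetical */
            drain_one();                                /* hypothetical */
        if (!have_items())
            zthr_exit(t, 0);    /* stops the thread; never returns */
        return (0);
    }

    /* later, from another thread: */
    static void
    drain_restart(zthr_t *t)
    {
        if (!zthr_isrunning(t))
            zthr_resume(t);
    }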
+
+/*
+ * This function is intended to be used by the zthr itself
+ * to check if another thread has signaled it to stop running.
+ *
+ * Returns TRUE if we are in the middle of trying to cancel
+ * this thread.
+ *
+ * Returns FALSE otherwise.
+ */
+boolean_t
+zthr_iscancelled(zthr_t *t)
+{
+	boolean_t cancelled;
+
+	ASSERT3P(t->zthr_thread, ==, curthread);
+
+	mutex_enter(&t->zthr_lock);
+	cancelled = t->zthr_cancel;
+	mutex_exit(&t->zthr_lock);
+
+	return (cancelled);
+}
+
+boolean_t
+zthr_isrunning(zthr_t *t)
+{
+	boolean_t running;
+
+	mutex_enter(&t->zthr_lock);
+	running = (t->zthr_thread != NULL);
+	mutex_exit(&t->zthr_lock);
+
+	return (running);
+}
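One last detail worth illustrating: zthr_cancel() above returns the rc most recently recorded by zthr_exit(), i.e. the last return value of func (or the rc a func passed to zthr_exit() itself), so a caller can observe how the worker stopped. A sketch, assuming a zthr whose func reports errors through its return value:

    static void
    stop_and_check(zthr_t *t)
    {
        int err = zthr_cancel(t);   /* last rc from func/zthr_exit() */

        if (err != 0)
            cmn_err(CE_WARN, "zthr stopped with error %d", err);
        zthr_destroy(t);
    }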