From 17811cc3a543a9ba8a60183e105a23094d4965f5 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 11 Nov 2025 02:29:44 +0500 Subject: [PATCH] Fix snapshot automount expiry cancellation deadlock A deadlock occurs when snapshot expiry tasks are cancelled while holding locks. The snapshot expiry task (snapentry_expire) spawns an umount process and waits for it to complete. Concurrently, ARC memory pressure triggers arc_prune which calls zfs_exit_fs(), attempting to cancel the expiry task while holding locks. The umount process spawned by the expiry task blocks trying to acquire locks held by arc_prune, which is blocked waiting for the expiry task to complete. This creates a circular dependency: expiry task waits for umount, umount waits for arc_prune, arc_prune waits for expiry task. Fix by adding non-blocking cancellation support to taskq_cancel_id(). The zfs_exit_fs() path calls zfsctl_snapshot_unmount_delay() to reschedule the unmount, which needs to cancel any existing expiry task. It now uses non-blocking cancellation to avoid waiting while holding locks, breaking the deadlock by returning immediately when the task is already running. The per-entry se_taskqid_lock has been removed, with all taskqid operations now protected by the global zfs_snapshot_lock held as WRITER. Additionally, an se_in_umount flag prevents recursive waits when zfsctl_destroy() is called during unmount. The taskqid is now only cleared by the caller on successful cancellation; running tasks clear their own taskqid upon completion. Signed-off-by: Ameer Hamza --- include/os/freebsd/spl/sys/taskq.h | 2 +- include/os/linux/spl/sys/taskq.h | 2 +- lib/libspl/include/sys/taskq.h | 2 +- lib/libspl/taskq.c | 4 +-- lib/libzfs/libzfs.abi | 1 + lib/libzfs_core/libzfs_core.abi | 1 + module/os/freebsd/spl/spl_taskq.c | 13 ++++++-- module/os/linux/spl/spl-kmem-cache.c | 2 +- module/os/linux/spl/spl-taskq.c | 27 +++++++++++---- module/os/linux/zfs/zfs_ctldir.c | 50 +++++++++++++--------------- module/os/linux/zfs/zfs_dir.c | 3 +- module/zfs/dmu_objset.c | 3 +- module/zfs/spa.c | 6 ++-- module/zfs/zfs_fm.c | 3 +- 14 files changed, 71 insertions(+), 48 deletions(-) diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 40f1a8ec2c57..949ea4dfaba1 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -107,7 +107,7 @@ extern void taskq_destroy(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait(taskq_t *); -extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t); extern int taskq_member(taskq_t *, kthread_t *); extern taskq_t *taskq_of_curthread(void); void taskq_suspend(taskq_t *); diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index c9b2bc994c8c..108b4fbeec8d 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -198,7 +198,7 @@ extern void taskq_destroy(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait(taskq_t *); -extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t); extern int taskq_member(taskq_t *, kthread_t *); extern taskq_t *taskq_of_curthread(void); diff --git a/lib/libspl/include/sys/taskq.h b/lib/libspl/include/sys/taskq.h index fbe3f388c05f..bcacf62c25dd 100644 --- a/lib/libspl/include/sys/taskq.h +++ b/lib/libspl/include/sys/taskq.h @@ -112,7 +112,7 @@ extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); extern taskq_t *taskq_of_curthread(void); -extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t); extern void system_taskq_init(void); extern void system_taskq_fini(void); diff --git a/lib/libspl/taskq.c b/lib/libspl/taskq.c index 043f70225551..21b406ecbd1d 100644 --- a/lib/libspl/taskq.c +++ b/lib/libspl/taskq.c @@ -398,9 +398,9 @@ taskq_of_curthread(void) } int -taskq_cancel_id(taskq_t *tq, taskqid_t id) +taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait) { - (void) tq, (void) id; + (void) tq, (void) id, (void) wait; return (ENOENT); } diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f481b6221e4d..351c900ff41e 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -2279,6 +2279,7 @@ + diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 238151d432f1..6ccb7890d90e 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -2123,6 +2123,7 @@ + diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index 78d9c8257208..3caa4c506183 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -351,7 +351,7 @@ taskq_free(taskq_ent_t *task) } int -taskq_cancel_id(taskq_t *tq, taskqid_t tid) +taskq_cancel_id(taskq_t *tq, taskqid_t tid, boolean_t wait) { uint32_t pend; int rc; @@ -362,12 +362,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) if (ent->tqent_type == NORMAL_TASK) { rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); - if (rc == EBUSY) + if (rc == EBUSY && wait) taskqueue_drain(tq->tq_queue, &ent->tqent_task); } else { rc = taskqueue_cancel_timeout(tq->tq_queue, &ent->tqent_timeout_task, &pend); - if (rc == EBUSY) { + if (rc == EBUSY && wait) { taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task); } @@ -381,6 +381,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) } /* Free the extra reference we added with taskq_lookup. */ taskq_free(ent); + + /* + * If task was running and we didn't wait, return EBUSY. + * Otherwise return 0 if cancelled or ENOENT if not found. + */ + if (rc == EBUSY && !wait) + return (EBUSY); return (pend ? 0 : ENOENT); } diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 22e4ed169d03..5594b2f80c02 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -840,7 +840,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) id = skc->skc_taskqid; spin_unlock(&skc->skc_lock); - taskq_cancel_id(spl_kmem_cache_taskq, id); + taskq_cancel_id(spl_kmem_cache_taskq, id, B_TRUE); /* * Wait until all current callers complete, this is mainly diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 00ff789265c6..107faa4eca31 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -598,13 +598,22 @@ taskq_of_curthread(void) EXPORT_SYMBOL(taskq_of_curthread); /* - * Cancel an already dispatched task given the task id. Still pending tasks - * will be immediately canceled, and if the task is active the function will - * block until it completes. Preallocated tasks which are canceled must be - * freed by the caller. + * Cancel a dispatched task. Pending tasks are cancelled immediately. + * If the task is running, behavior depends on wait parameter: + * - wait=B_TRUE: Block until task completes + * - wait=B_FALSE: Return EBUSY immediately + * + * Return values: + * 0 - Cancelled before execution. Caller must release resources. + * EBUSY - Task running (wait=B_FALSE only). Will self-cleanup. + * ENOENT - Not found, or completed after waiting. Already cleaned up. + * + * Note: wait=B_TRUE returns ENOENT (not EBUSY) after waiting because + * the task no longer exists. This distinguishes "cancelled before run" + * from "completed naturally" for proper resource management. */ int -taskq_cancel_id(taskq_t *tq, taskqid_t id) +taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait) { taskq_ent_t *t; int rc = ENOENT; @@ -650,8 +659,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id) spin_unlock_irqrestore(&tq->tq_lock, flags); if (t == ERR_PTR(-EBUSY)) { - taskq_wait_id(tq, id); - rc = EBUSY; + if (wait) { + taskq_wait_id(tq, id); + rc = ENOENT; /* Completed, no longer exists */ + } else { + rc = EBUSY; /* Still running */ + } } return (rc); diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index fb4de50480a3..1ac60119fdcf 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -120,7 +120,6 @@ typedef struct { spa_t *se_spa; /* pool spa */ uint64_t se_objsetid; /* snapshot objset id */ struct dentry *se_root_dentry; /* snapshot root dentry */ - krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */ taskqid_t se_taskqid; /* scheduled unmount taskqid */ avl_node_t se_node_name; /* zfs_snapshots_by_name link */ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ @@ -147,7 +146,6 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa, se->se_objsetid = objsetid; se->se_root_dentry = root_dentry; se->se_taskqid = TASKQID_INVALID; - rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL); zfs_refcount_create(&se->se_refcount); @@ -164,7 +162,6 @@ zfsctl_snapshot_free(zfs_snapentry_t *se) zfs_refcount_destroy(&se->se_refcount); kmem_strfree(se->se_name); kmem_strfree(se->se_path); - rw_destroy(&se->se_taskqid_lock); kmem_free(se, sizeof (zfs_snapentry_t)); } @@ -340,17 +337,15 @@ snapentry_expire(void *data) return; } - rw_enter(&se->se_taskqid_lock, RW_WRITER); - se->se_taskqid = TASKQID_INVALID; - rw_exit(&se->se_taskqid_lock); (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); - zfsctl_snapshot_rele(se); /* - * Reschedule the unmount if the zfs_snapentry_t wasn't removed. + * Clear taskqid and reschedule if the snapshot wasn't removed. * This can occur when the snapshot is busy. */ - rw_enter(&zfs_snapshot_lock, RW_READER); + rw_enter(&zfs_snapshot_lock, RW_WRITER); + se->se_taskqid = TASKQID_INVALID; + zfsctl_snapshot_rele(se); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); zfsctl_snapshot_rele(se); @@ -367,17 +362,17 @@ static void zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) { int err = 0; - rw_enter(&se->se_taskqid_lock, RW_WRITER); - err = taskq_cancel_id(system_delay_taskq, se->se_taskqid); + + ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); + + err = taskq_cancel_id(system_delay_taskq, se->se_taskqid, B_FALSE); /* - * if we get ENOENT, the taskq couldn't be found to be - * canceled, so we can just mark it as invalid because - * it's already gone. If we got EBUSY, then we already - * blocked until it was gone _anyway_, so we don't care. + * Clear taskqid only if we successfully cancelled before execution. + * For ENOENT, task already cleared it. For EBUSY, task will clear + * it when done. */ - se->se_taskqid = TASKQID_INVALID; - rw_exit(&se->se_taskqid_lock); if (err == 0) { + se->se_taskqid = TASKQID_INVALID; zfsctl_snapshot_rele(se); } } @@ -388,12 +383,11 @@ zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) { + ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); if (delay <= 0) return; - zfsctl_snapshot_hold(se); - rw_enter(&se->se_taskqid_lock, RW_WRITER); /* * If this condition happens, we managed to: * - dispatch once @@ -404,13 +398,12 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) * no problem. */ if (se->se_taskqid != TASKQID_INVALID) { - rw_exit(&se->se_taskqid_lock); - zfsctl_snapshot_rele(se); return; } + + zfsctl_snapshot_hold(se); se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); - rw_exit(&se->se_taskqid_lock); } /* @@ -425,7 +418,7 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) zfs_snapentry_t *se; int error = ENOENT; - rw_enter(&zfs_snapshot_lock, RW_READER); + rw_enter(&zfs_snapshot_lock, RW_WRITER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_delay_impl(se, delay); @@ -614,13 +607,18 @@ zfsctl_destroy(zfsvfs_t *zfsvfs) rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); - if (se != NULL) - zfsctl_snapshot_remove(se); - rw_exit(&zfs_snapshot_lock); if (se != NULL) { + zfsctl_snapshot_remove(se); + /* + * Don't wait if snapentry_expire task is calling + * umount, which may have resulted in this destroy + * call. Waiting would deadlock: snapentry_expire + * waits for umount while umount waits for task. + */ zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_rele(se); } + rw_exit(&zfs_snapshot_lock); } else if (zfsvfs->z_ctldir) { iput(zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index e8de536606e2..7edea05f94e6 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -573,7 +573,8 @@ zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) if (zfsvfs->z_draining) { zfsvfs->z_drain_cancel = B_TRUE; taskq_cancel_id(dsl_pool_unlinked_drain_taskq( - dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); + dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task, + B_TRUE); zfsvfs->z_drain_task = TASKQID_INVALID; zfsvfs->z_draining = B_FALSE; } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 8e6b569c2100..5a815b59e37b 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1452,7 +1452,8 @@ dmu_objset_upgrade_stop(objset_t *os) os->os_upgrade_id = 0; mutex_exit(&os->os_upgrade_lock); - if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) { + if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id, + B_TRUE)) == 0) { dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } txg_wait_synced(os->os_spa->spa_dsl_pool, 0); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 34de3f1d9525..c481070e1f2d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1934,7 +1934,7 @@ spa_deactivate(spa_t *spa) list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); - taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { @@ -10451,7 +10451,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_sync_starttime = gethrtime(); - taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); @@ -10508,7 +10508,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); - taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE); spa->spa_deadman_tqid = 0; /* diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 4a0d41c24eed..eb18296ec3f2 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1531,7 +1531,8 @@ zfs_ereport_taskq_fini(void) { mutex_enter(&recent_events_lock); if (recent_events_cleaner_tqid != 0) { - taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); + taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid, + B_TRUE); recent_events_cleaner_tqid = 0; } mutex_exit(&recent_events_lock);