From 645889474d50ffc3965fb160f2389cd28520921e Mon Sep 17 00:00:00 2001 From: Grady Wong Date: Mon, 8 Oct 2018 10:32:40 +0800 Subject: [PATCH 1/6] fix write IO hang. The bug time sequence: 1. context #1, `zfs_write` assign a txg "n". 2. In a same process, context #2, mmap page fault (which means the `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed, and wait previous txg "n" completed. 3. context #1 call `uiomove` to write, however page fault is occurred in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by context #2, so it stuck and can't complete, then txg "n" will not complete. So context #1 and context #2 trap into the "dead lock". Signed-off-by: Grady Wong --- include/spl/sys/uio.h | 1 + module/zcommon/zfs_uio.c | 21 +++++++++++++++++++-- module/zfs/zfs_vnops.c | 20 +++++++++++++++++++- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h index 64c452b8d17f..fac26079d7bc 100644 --- a/include/spl/sys/uio.h +++ b/include/spl/sys/uio.h @@ -53,6 +53,7 @@ typedef struct uio { int uio_iovcnt; offset_t uio_loffset; uio_seg_t uio_segflg; + boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; offset_t uio_limit; diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c index af9716126f6d..9045e1ac218a 100644 --- a/module/zcommon/zfs_uio.c +++ b/module/zcommon/zfs_uio.c @@ -52,6 +52,7 @@ #include #include #include +#include /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -79,8 +80,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) if (copy_to_user(iov->iov_base+skip, p, cnt)) return (EFAULT); } else { - if (copy_from_user(p, iov->iov_base+skip, cnt)) - return (EFAULT); + if (uio->uio_fault_disable) { + if (!access_ok(VERIFY_READ, + (iov->iov_base + skip), cnt)) { + return (EFAULT); + } + + pagefault_disable(); + if (__copy_from_user_inatomic(p, + (iov->iov_base + skip), cnt)) { + pagefault_enable(); + return (EFAULT); + } + pagefault_enable(); + } else { + if (copy_from_user(p, + (iov->iov_base + skip), cnt)) + return (EFAULT); + } } break; case UIO_SYSSPACE: diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4e163e2e3fe8..76c8e2510cf9 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -809,8 +809,17 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = uio->uio_resid; + uio->uio_fault_disable = B_TRUE; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); + if (error == EFAULT) { + dmu_tx_commit(tx); + uio_prefaultpages(MIN(n, max_blksz), uio); + continue; + } else if (error != 0) { + dmu_tx_abort(tx); + break; + } tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; @@ -4636,13 +4645,22 @@ zfs_dirty_inode(struct inode *ip, int flags) } #endif +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); + boolean_t waited = B_FALSE; + error = dmu_tx_assign(tx, + waited ? (TXG_NOTHROTTLE | TXG_WAIT) : TXG_NOWAIT); if (error) { + if (error == ERESTART && waited == B_FALSE) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } dmu_tx_abort(tx); goto out; } From 9f7221f2adbaa27205d650a6578c1386a2ea7437 Mon Sep 17 00:00:00 2001 From: Grady Wong Date: Tue, 9 Oct 2018 16:05:47 +0800 Subject: [PATCH 2/6] return EFAULT in `uio_prefaultpages` Signed-off-by: Grady Wong --- include/sys/uio_impl.h | 2 +- module/zcommon/zfs_uio.c | 10 ++++++---- module/zfs/zfs_vnops.c | 16 ++++++++++++---- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index 37e283da0f8b..cfef0b95dbb9 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -42,7 +42,7 @@ #include extern int uiomove(void *, size_t, enum uio_rw, uio_t *); -extern void uio_prefaultpages(ssize_t, uio_t *); +extern int uio_prefaultpages(ssize_t, uio_t *); extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); extern void uioskip(uio_t *, size_t); diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c index 9045e1ac218a..a2c1b5c3aafe 100644 --- a/module/zcommon/zfs_uio.c +++ b/module/zcommon/zfs_uio.c @@ -175,7 +175,7 @@ EXPORT_SYMBOL(uiomove); * error will terminate the process as this is only a best attempt to get * the pages resident. */ -void +int uio_prefaultpages(ssize_t n, struct uio *uio) { const struct iovec *iov; @@ -189,7 +189,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio) switch (uio->uio_segflg) { case UIO_SYSSPACE: case UIO_BVEC: - return; + return (0); case UIO_USERSPACE: case UIO_USERISPACE: break; @@ -213,7 +213,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio) p = iov->iov_base + skip; while (cnt) { if (fuword8((uint8_t *)p, &tmp)) - return; + return (EFAULT); incr = MIN(cnt, PAGESIZE); p += incr; cnt -= incr; @@ -223,8 +223,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio) */ p--; if (fuword8((uint8_t *)p, &tmp)) - return; + return (EFAULT); } + + return (0); } EXPORT_SYMBOL(uio_prefaultpages); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 76c8e2510cf9..b85d98998dd7 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -650,7 +650,9 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) xuio = (xuio_t *)uio; else #endif - uio_prefaultpages(MIN(n, max_blksz), uio); + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + return (SET_ERROR(EFAULT)); + } rl_t *rl; @@ -814,7 +816,9 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) uio, nbytes, tx); if (error == EFAULT) { dmu_tx_commit(tx); - uio_prefaultpages(MIN(n, max_blksz), uio); + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + break; + } continue; } else if (error != 0) { dmu_tx_abort(tx); @@ -919,8 +923,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ASSERT(tx_bytes == nbytes); n -= nbytes; - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); + if (!xuio && n > 0) { + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + error = EFAULT; + break; + } + } } zfs_inode_update(zp); From 8e221f6e5bcd878f64f419c671ed2767a3a9746b Mon Sep 17 00:00:00 2001 From: Grady Wong Date: Thu, 11 Oct 2018 01:20:50 +0800 Subject: [PATCH 3/6] update mmapwrite unittest. Signed-off-by: Grady Wong --- module/zfs/zfs_vnops.c | 1 + tests/zfs-tests/cmd/mmapwrite/mmapwrite.c | 140 +++++++++++++----- .../functional/mmap/mmap_write_001_pos.ksh | 8 +- 3 files changed, 105 insertions(+), 44 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index b85d98998dd7..c183fd3d8036 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -651,6 +651,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) else #endif if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + ZFS_EXIT(zfsvfs); return (SET_ERROR(EFAULT)); } diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c index 190d31af3aaa..b9915d5d31eb 100644 --- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c +++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c @@ -31,74 +31,132 @@ #include #include #include +#include +#include /* * -------------------------------------------------------------------- - * Bug Id: 5032643 + * Bug Issue Id: #7512 + * The bug time sequence: + * 1. context #1, zfs_write assign a txg "n". + * 2. In the same process, context #2, mmap page fault (which means the mm_sem + * is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous + * txg "n" completed. + * 3. context #1 call uiomove to write, however page fault is occurred in + * uiomove, which means it need mm_sem, but mm_sem is hold by + * context #2, so it stuck and can't complete, then txg "n" will not + * complete. * - * Simply writing to a file and mmaping that file at the same time can - * result in deadlock. Nothing perverse like writing from the file's - * own mapping is required. + * So context #1 and context #2 trap into the "dead lock". * -------------------------------------------------------------------- */ +#define NORMAL_WRITE_TH_NUM 2 + static void * -mapper(void *fdp) +normal_writer(void *filename) { - void *addr; - int fd = *(int *)fdp; + char *file_path = filename; + int fd = -1; + ssize_t write_num = 0; + int page_size = getpagesize(); - if ((addr = - mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) { - perror("mmap"); - exit(1); + fd = open(file_path, O_RDWR | O_CREAT, 0777); + if (fd == -1) { + err(1, "failed to open %s", file_path); } - for (;;) { - if (mmap(addr, 8192, PROT_READ, - MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) { - perror("mmap"); - exit(1); + + char *buf = malloc(1); + while (1) { + write_num = write(fd, buf, 1); + if (write_num == 0) { + err(1, "write failed!"); + break; } + lseek(fd, page_size, SEEK_CUR); + } + + if (buf) { + free(buf); } - /* NOTREACHED */ - return ((void *)1); } -int -main(int argc, char **argv) +static void * +map_writer(void *filename) { - int fd; - char buf[1024]; - pthread_t tid; + int fd = -1; + int ret = 0; + char *buf = NULL; + int page_size = getpagesize(); + int op_errno = 0; + char *file_path = filename; - memset(buf, 'a', sizeof (buf)); + while (1) { + ret = access(file_path, F_OK); + if (ret) { + op_errno = errno; + if (op_errno == ENOENT) { + fd = open(file_path, O_RDWR | O_CREAT, 0777); + if (fd == -1) { + err(1, "open file failed"); + } - if (argc != 2) { - (void) printf("usage: %s \n", argv[0]); - exit(1); - } + ret = ftruncate(fd, page_size); + if (ret == -1) { + err(1, "truncate file failed"); + } + } else { + err(1, "access file failed!"); + } + } else { + fd = open(file_path, O_RDWR, 0777); + if (fd == -1) { + err(1, "open file failed"); + } + } - if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) { - perror("open"); - exit(1); + if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0)) == MAP_FAILED) { + err(1, "map file failed"); + } + + if (fd != -1) + close(fd); + + char s[10] = {0, }; + memcpy(buf, s, 10); + ret = munmap(buf, page_size); + if (ret != 0) { + err(1, "unmap file failed"); + } } +} - (void) pthread_setconcurrency(2); - if (pthread_create(&tid, NULL, mapper, &fd) != 0) { - perror("pthread_create"); - close(fd); +int +main(int argc, char **argv) +{ + pthread_t map_write_tid; + pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM]; + int i = 0; + + if (argc != 3) { + (void) printf("usage: %s " + "\n", argv[0]); exit(1); } - for (;;) { - if (write(fd, buf, sizeof (buf)) == -1) { - perror("write"); - close(fd); - exit(1); + + for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) { + if (pthread_create(&normal_write_tid[i], NULL, normal_writer, + argv[1])) { + err(1, "pthread_create normal_writer failed."); } } - close(fd); + if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) { + err(1, "pthread_create map_writer failed."); + } /* NOTREACHED */ + pthread_join(map_write_tid, NULL); return (0); } diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh index 1eda971041dc..24150b827f8f 100755 --- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh @@ -53,12 +53,14 @@ if ! is_mp; then fi log_must chmod 777 $TESTDIR -mmapwrite $TESTDIR/test-write-file & +mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file & PID_MMAPWRITE=$! -log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE" +log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\ + "pid: $PID_MMAPWRITE" log_must sleep 30 log_must kill -9 $PID_MMAPWRITE -log_must ls -l $TESTDIR/test-write-file +log_must ls -l $TESTDIR/normal_write_file +log_must ls -l $TESTDIR/map_write_file log_pass "write(2) a mmap(2)'ing file succeeded." From 102619e813a7a05b73c58ac709af8304476f0753 Mon Sep 17 00:00:00 2001 From: Grady Wong Date: Sat, 13 Oct 2018 10:41:54 +0800 Subject: [PATCH 4/6] Explain why zfs_dirty_inode need TXG_WAIT Signed-off-by: Grady Wong --- module/zfs/zfs_vnops.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c183fd3d8036..a362e3b5bddf 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -4656,16 +4656,18 @@ zfs_dirty_inode(struct inode *ip, int flags) top: tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - boolean_t waited = B_FALSE; - error = dmu_tx_assign(tx, - waited ? (TXG_NOTHROTTLE | TXG_WAIT) : TXG_NOWAIT); + /* + * Despite this function allows an error to be returned, it's called + * from zpl_dirty_inode() which is a Linux VFS callback functions + * (.dirty_inode) which must always succeed, so we have to assign a + * txg with TXG_NOTHROTTLE plus TX_WAIT. + */ + error = dmu_tx_assign(tx, TXG_NOTHROTTLE | TXG_WAIT); if (error) { - if (error == ERESTART && waited == B_FALSE) { - waited = B_TRUE; + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; From cfce4c24a432d7cba967e0dca928bf4ea80c4909 Mon Sep 17 00:00:00 2001 From: Grady Wong Date: Tue, 16 Oct 2018 04:46:53 +0800 Subject: [PATCH 5/6] Restore the zfs_dirty_inode Signed-off-by: Grady Wong --- module/zfs/zfs_vnops.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a362e3b5bddf..a896ea553242 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -4654,24 +4654,13 @@ zfs_dirty_inode(struct inode *ip, int flags) } #endif -top: tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - /* - * Despite this function allows an error to be returned, it's called - * from zpl_dirty_inode() which is a Linux VFS callback functions - * (.dirty_inode) which must always succeed, so we have to assign a - * txg with TXG_NOTHROTTLE plus TX_WAIT. - */ - error = dmu_tx_assign(tx, TXG_NOTHROTTLE | TXG_WAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); goto out; } From 9cc361347fa6cd92c5aafd34cb152a3f9a750d43 Mon Sep 17 00:00:00 2001 From: Grady Wong Date: Tue, 16 Oct 2018 08:31:26 +0800 Subject: [PATCH 6/6] can't abort tx when write error. Signed-off-by: Grady Wong --- module/zfs/zfs_vnops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a896ea553242..63d5609c8a4b 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -822,7 +822,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) } continue; } else if (error != 0) { - dmu_tx_abort(tx); + dmu_tx_commit(tx); break; } tx_bytes -= uio->uio_resid;