openzfs · ahrens · Oct 16, 2018 · Oct 8, 2018 · Oct 9, 2018 · Oct 10, 2018
diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h
@@ -53,6 +53,7 @@ typedef struct uio {
 	int		uio_iovcnt;
 	offset_t	uio_loffset;
 	uio_seg_t	uio_segflg;
+	boolean_t	uio_fault_disable;
 	uint16_t	uio_fmode;
 	uint16_t	uio_extflg;
 	offset_t	uio_limit;

diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h
@@ -42,7 +42,7 @@
 #include <sys/uio.h>
 
 extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
-extern void uio_prefaultpages(ssize_t, uio_t *);
+extern int uio_prefaultpages(ssize_t, uio_t *);
 extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
 extern void uioskip(uio_t *, size_t);
 

diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
@@ -52,6 +52,7 @@
 #include <sys/sysmacros.h>
 #include <sys/strings.h>
 #include <linux/kmap_compat.h>
+#include <linux/uaccess.h>
 
 /*
  * Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -79,8 +80,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
 				if (copy_to_user(iov->iov_base+skip, p, cnt))
 					return (EFAULT);
 			} else {
-				if (copy_from_user(p, iov->iov_base+skip, cnt))
-					return (EFAULT);
+				if (uio->uio_fault_disable) {
+					if (!access_ok(VERIFY_READ,
+					    (iov->iov_base + skip), cnt)) {
+						return (EFAULT);
+					}
+
+					pagefault_disable();
+					if (__copy_from_user_inatomic(p,
+					    (iov->iov_base + skip), cnt)) {
+						pagefault_enable();
+						return (EFAULT);
+					}
+					pagefault_enable();
+				} else {
+					if (copy_from_user(p,
+					    (iov->iov_base + skip), cnt))
+						return (EFAULT);
+				}
 			}
 			break;
 		case UIO_SYSSPACE:
@@ -158,7 +175,7 @@ EXPORT_SYMBOL(uiomove);
  * error will terminate the process as this is only a best attempt to get
  * the pages resident.
  */
-void
+int
 uio_prefaultpages(ssize_t n, struct uio *uio)
 {
 	const struct iovec *iov;
@@ -172,7 +189,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
 	switch (uio->uio_segflg) {
 		case UIO_SYSSPACE:
 		case UIO_BVEC:
-			return;
+			return (0);
 		case UIO_USERSPACE:
 		case UIO_USERISPACE:
 			break;
@@ -196,7 +213,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
 		p = iov->iov_base + skip;
 		while (cnt) {
 			if (fuword8((uint8_t *)p, &tmp))
-				return;
+				return (EFAULT);
 			incr = MIN(cnt, PAGESIZE);
 			p += incr;
 			cnt -= incr;
@@ -206,8 +223,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
 		 */
 		p--;
 		if (fuword8((uint8_t *)p, &tmp))
-			return;
+			return (EFAULT);
 	}
+
+	return (0);
 }
 EXPORT_SYMBOL(uio_prefaultpages);
 

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
@@ -650,7 +650,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		xuio = (xuio_t *)uio;
 	else
 #endif
-		uio_prefaultpages(MIN(n, max_blksz), uio);
+		if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EFAULT));
+		}
 
 	rl_t	 *rl;
 
@@ -809,8 +812,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
+			uio->uio_fault_disable = B_TRUE;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
+			if (error == EFAULT) {
+				dmu_tx_commit(tx);
+				if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+					break;
+				}
+				continue;
+			} else if (error != 0) {
+				dmu_tx_commit(tx);
+				break;
+			}
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
@@ -910,8 +924,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
-		if (!xuio && n > 0)
-			uio_prefaultpages(MIN(n, max_blksz), uio);
+		if (!xuio && n > 0) {
+			if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+				error = EFAULT;
+				break;
+			}
+		}
 	}
 
 	zfs_inode_update(zp);
@@ -4636,13 +4654,27 @@ zfs_dirty_inode(struct inode *ip, int flags)
 	}
 #endif
 
+	/*
+	 * Declare "waited" ahead of the retry label.  If it were initialized
+	 * after "top:", every "goto top" would reset it to B_FALSE and the
+	 * TXG_NOTHROTTLE | TXG_WAIT retry below would never be selected.
+	 */
+	boolean_t waited = B_FALSE;
+top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
-	error = dmu_tx_assign(tx, TXG_WAIT);
+	error = dmu_tx_assign(tx,
+	    waited ? (TXG_NOTHROTTLE | TXG_WAIT) : TXG_NOWAIT);
 	if (error) {
+		if (error == ERESTART && waited == B_FALSE) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
 		dmu_tx_abort(tx);
 		goto out;
 	}

diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
@@ -31,74 +31,132 @@
 #include <string.h>
 #include <sys/mman.h>
 #include <pthread.h>
+#include <errno.h>
+#include <err.h>
 
 /*
  * --------------------------------------------------------------------
- * Bug Id: 5032643
+ * Issue Id: #7512
+ * The sequence of events that triggers the bug:
+ * 1. Context #1: zfs_write assigns a txg "n".
+ * 2. Context #2 (same process): an mmap page fault occurs (so mm_sem is
+ *    held); zfs_dirty_inode fails to open a txg and waits for the previous
+ *    txg "n" to complete.
+ * 3. Context #1 calls uiomove to write, but a page fault occurs in
+ *    uiomove, which needs mm_sem.  Since mm_sem is held by context #2,
+ *    context #1 is stuck and cannot complete, so txg "n" never
+ *    completes.
  *
- * Simply writing to a file and mmaping that file at the same time can
- * result in deadlock.  Nothing perverse like writing from the file's
- * own mapping is required.
+ * As a result, context #1 and context #2 deadlock against each other.
  * --------------------------------------------------------------------
  */
 
+#define	NORMAL_WRITE_TH_NUM	2
+
 static void *
-mapper(void *fdp)
+normal_writer(void *filename)
 {
-	void *addr;
-	int fd = *(int *)fdp;
+	char *file_path = filename;
+	int fd = -1;
+	ssize_t write_num = 0;
+	int page_size = getpagesize();
 
-	if ((addr =
-	    mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
+	fd = open(file_path, O_RDWR | O_CREAT, 0777);
+	if (fd == -1) {
+		err(1, "failed to open %s", file_path);
 	}
-	for (;;) {
-		if (mmap(addr, 8192, PROT_READ,
-		    MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
-			perror("mmap");
-			exit(1);
+
+	char *buf = malloc(1);
+	while (1) {
+		write_num = write(fd, buf, 1);
+		if (write_num == 0) {
+			err(1, "write failed!");
+			break;
 		}
+		lseek(fd, page_size, SEEK_CUR);
+	}
+
+	if (buf) {
+		free(buf);
 	}
-	/* NOTREACHED */
-	return ((void *)1);
 }
 
-int
-main(int argc, char **argv)
+static void *
+map_writer(void *filename)
 {
-	int fd;
-	char buf[1024];
-	pthread_t tid;
+	int fd = -1;
+	int ret = 0;
+	char *buf = NULL;
+	int page_size = getpagesize();
+	int op_errno = 0;
+	char *file_path = filename;
 
-	memset(buf, 'a', sizeof (buf));
+	while (1) {
+		ret = access(file_path, F_OK);
+		if (ret) {
+			op_errno = errno;
+			if (op_errno == ENOENT) {
+				fd = open(file_path, O_RDWR | O_CREAT, 0777);
+				if (fd == -1) {
+					err(1, "open file failed");
+				}
 
-	if (argc != 2) {
-		(void) printf("usage: %s <file name>\n", argv[0]);
-		exit(1);
-	}
+				ret = ftruncate(fd, page_size);
+				if (ret == -1) {
+					err(1, "truncate file failed");
+				}
+			} else {
+				err(1, "access file failed!");
+			}
+		} else {
+			fd = open(file_path, O_RDWR, 0777);
+			if (fd == -1) {
+				err(1, "open file failed");
+			}
+		}
 
-	if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
-		perror("open");
-		exit(1);
+		if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, fd, 0)) == MAP_FAILED) {
+			err(1, "map file failed");
+		}
+
+		if (fd != -1)
+			close(fd);
+
+		char s[10] = {0, };
+		memcpy(buf, s, 10);
+		ret = munmap(buf, page_size);
+		if (ret != 0) {
+			err(1, "unmap file failed");
+		}
 	}
+}
 
-	(void) pthread_setconcurrency(2);
-	if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
-		perror("pthread_create");
-		close(fd);
+int
+main(int argc, char **argv)
+{
+	pthread_t map_write_tid;
+	pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
+	int i = 0;
+
+	if (argc != 3) {
+		(void) printf("usage: %s <normal write file name> "
+		    "<map write file name>\n", argv[0]);
 		exit(1);
 	}
-	for (;;) {
-		if (write(fd, buf, sizeof (buf)) == -1) {
-			perror("write");
-			close(fd);
-			exit(1);
+
+	for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
+		if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
+		    argv[1])) {
+			err(1, "pthread_create normal_writer failed.");
 		}
 	}
 
-	close(fd);
+	if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
+		err(1, "pthread_create map_writer failed.");
+	}
 
 	/* NOTREACHED */
+	pthread_join(map_write_tid, NULL);
 	return (0);
 }
diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
@@ -53,12 +53,14 @@ if ! is_mp; then
 fi
 
 log_must chmod 777 $TESTDIR
-mmapwrite $TESTDIR/test-write-file &
+mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
 PID_MMAPWRITE=$!
-log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
+log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
+	 "pid: $PID_MMAPWRITE"
 log_must sleep 30
 
 log_must kill -9 $PID_MMAPWRITE
-log_must ls -l $TESTDIR/test-write-file
+log_must ls -l $TESTDIR/normal_write_file
+log_must ls -l $TESTDIR/map_write_file
 
 log_pass "write(2) a mmap(2)'ing file succeeded."