diff --git a/include/sys/dmu.h b/include/sys/dmu.h index bc7046fdced8..a78b233d7889 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -778,6 +778,7 @@ void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +int dmu_tx_reassign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index 37e283da0f8b..c7dc47808c3d 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -42,6 +42,7 @@ #include extern int uiomove(void *, size_t, enum uio_rw, uio_t *); +extern int uiomove_write_no_pagefault(void *p, size_t n, struct uio *uio); extern void uio_prefaultpages(ssize_t, uio_t *); extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); extern void uioskip(uio_t *, size_t); diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c index af9716126f6d..42aefa441870 100644 --- a/module/zcommon/zfs_uio.c +++ b/module/zcommon/zfs_uio.c @@ -52,6 +52,7 @@ #include #include #include +#include /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -150,6 +151,55 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) } EXPORT_SYMBOL(uiomove); +int +uiomove_write_no_pagefault(void *p, size_t n, struct uio *uio) +{ + const struct iovec *iov = uio->uio_iov; + size_t skip = uio->uio_skip; + ulong_t cnt; + + ASSERT(uio->uio_segflg != UIO_BVEC); + + while (n && uio->uio_resid) { + cnt = MIN(iov->iov_len - skip, n); + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + /* + * p = kernel data pointer + * iov->iov_base = user data pointer + */ + pagefault_disable(); + if (copy_from_user(p, iov->iov_base+skip, + cnt)) { + pagefault_enable(); + return (EFAULT); + } + pagefault_enable(); + break; + case UIO_SYSSPACE: + bcopy(iov->iov_base + skip, p, cnt); + break; + default: + ASSERT(0); + } + skip += cnt; + if (skip == iov->iov_len) { + skip = 0; + uio->uio_iov = (++iov); + uio->uio_iovcnt--; + } + uio->uio_skip = skip; + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + p = (caddr_t)p + cnt; + n -= cnt; + } + return (0); + +} +EXPORT_SYMBOL(uiomove_write_no_pagefault); + #define fuword8(uptr, vptr) get_user((*vptr), (uptr)) /* diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8779eb3586ca..452307c06463 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1523,8 +1523,8 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) * to lock the pages in memory, so that uiomove won't * block. */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); + err = uiomove_write_no_pagefault((char *)db->db_data + bufoff, + tocpy, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index c268f3c40465..e835cd19c59c 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1040,6 +1040,40 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) return (0); } +int +dmu_tx_reassign(dmu_tx_t *tx, uint64_t txg_how) +{ + int err; + dmu_tx_hold_t *txh; + + /* + * Walk the transaction's hold list, removing the hold on the + * associated dnode, and notifying waiters if the refcount drops to 0. + */ + for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + txg_rele_to_sync(&tx->tx_txgh); + + tx->tx_txg = 0; + err = dmu_tx_assign(tx, txg_how); + + return (err); +} + void dmu_tx_wait(dmu_tx_t *tx) { diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4e163e2e3fe8..df13eae0da96 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -808,9 +808,32 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ssize_t tx_bytes; if (abuf == NULL) { + int reassign_error = 0; tx_bytes = uio->uio_resid; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); + do { + error = dmu_write_uio_dbuf( + sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); + if (error) { + if (error == EFAULT) { + uio_prefaultpages(MIN(n, + max_blksz), uio); + reassign_error = + dmu_tx_reassign(tx, + TXG_WAIT); + if (reassign_error) { + break; + } + } + } + } while (error == EFAULT); + + if (reassign_error) { + dmu_tx_abort(tx); + if (abuf != NULL) { + dmu_return_arcbuf(abuf); + } + break; + } tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes;