Skip to content

Commit

Permalink
fix write IO hang.
Browse files Browse the repository at this point in the history
The bug time sequence:
1. context #1, `zfs_write` assign a txg "n".
2. In a same process, context #2, mmap page fault (which means the
   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
   and wait previous txg "n" completed.
3. context #1 call `uiomove` to write, however page fault is occurred
   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
   context #2, so it stuck and can't complete,  then txg "n" will
   not complete.

So context #1 and context #2 trap into the "dead lock".

Signed-off-by: Grady Wong <grady.w@xtaotech.com>
  • Loading branch information
Grady Wong committed Sep 21, 2018
1 parent 7522a26 commit 629bcae
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 4 deletions.
1 change: 1 addition & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,7 @@ void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
int dmu_tx_reassign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
void dmu_tx_mark_netfree(dmu_tx_t *tx);
Expand Down
1 change: 1 addition & 0 deletions include/sys/uio_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include <sys/uio.h>

extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
extern int uiomove_write_no_pagefault(void *p, size_t n, struct uio *uio);
extern void uio_prefaultpages(ssize_t, uio_t *);
extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
extern void uioskip(uio_t *, size_t);
Expand Down
50 changes: 50 additions & 0 deletions module/zcommon/zfs_uio.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#include <sys/sysmacros.h>
#include <sys/strings.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>

/*
* Move "n" bytes at byte address "p"; "rw" indicates the direction
Expand Down Expand Up @@ -150,6 +151,55 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
}
EXPORT_SYMBOL(uiomove);

int
uiomove_write_no_pagefault(void *p, size_t n, struct uio *uio)
{
const struct iovec *iov = uio->uio_iov;
size_t skip = uio->uio_skip;
ulong_t cnt;

ASSERT(uio->uio_segflg != UIO_BVEC);

while (n && uio->uio_resid) {
cnt = MIN(iov->iov_len - skip, n);
switch (uio->uio_segflg) {
case UIO_USERSPACE:
case UIO_USERISPACE:
/*
* p = kernel data pointer
* iov->iov_base = user data pointer
*/
pagefault_disable();
if (copy_from_user(p, iov->iov_base+skip,
cnt)) {
pagefault_enable();
return (EFAULT);
}
pagefault_enable();
break;
case UIO_SYSSPACE:
bcopy(iov->iov_base + skip, p, cnt);
break;
default:
ASSERT(0);
}
skip += cnt;
if (skip == iov->iov_len) {
skip = 0;
uio->uio_iov = (++iov);
uio->uio_iovcnt--;
}
uio->uio_skip = skip;
uio->uio_resid -= cnt;
uio->uio_loffset += cnt;
p = (caddr_t)p + cnt;
n -= cnt;
}
return (0);

}
EXPORT_SYMBOL(uiomove_write_no_pagefault);

#define fuword8(uptr, vptr) get_user((*vptr), (uptr))

/*
Expand Down
4 changes: 2 additions & 2 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1523,8 +1523,8 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
* to lock the pages in memory, so that uiomove won't
* block.
*/
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_WRITE, uio);
err = uiomove_write_no_pagefault((char *)db->db_data + bufoff,
tocpy, uio);

if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
Expand Down
34 changes: 34 additions & 0 deletions module/zfs/dmu_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,40 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
return (0);
}

int
dmu_tx_reassign(dmu_tx_t *tx, uint64_t txg_how)
{
int err;
dmu_tx_hold_t *txh;

/*
* Walk the transaction's hold list, removing the hold on the
* associated dnode, and notifying waiters if the refcount drops to 0.
*/
for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;

if (dn == NULL)
continue;
mutex_enter(&dn->dn_mtx);
ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
dn->dn_assigned_txg = 0;
cv_broadcast(&dn->dn_notxholds);
}
mutex_exit(&dn->dn_mtx);
}

txg_rele_to_sync(&tx->tx_txgh);

tx->tx_txg = 0;
err = dmu_tx_assign(tx, txg_how);

return (err);
}

void
dmu_tx_wait(dmu_tx_t *tx)
{
Expand Down
27 changes: 25 additions & 2 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -808,9 +808,32 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)

ssize_t tx_bytes;
if (abuf == NULL) {
int reassign_error = 0;
tx_bytes = uio->uio_resid;
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx);
do {
error = dmu_write_uio_dbuf(
sa_get_db(zp->z_sa_hdl), uio, nbytes, tx);
if (error) {
if (error == EFAULT) {
uio_prefaultpages(MIN(n,
max_blksz), uio);
reassign_error =
dmu_tx_reassign(tx,
TXG_WAIT);
if (reassign_error) {
break;
}
}
}
} while (error == EFAULT);

if (reassign_error) {
dmu_tx_abort(tx);
if (abuf != NULL) {
dmu_return_arcbuf(abuf);
}
break;
}
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
Expand Down

0 comments on commit 629bcae

Please sign in to comment.