68 changes: 33 additions & 35 deletions include/sys/zio.h
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/

@@ -130,19 +130,16 @@ enum zio_compress {
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2

#define ZIO_PRIORITY_NOW (zio_priority_table[0])
#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3])
#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4])
#define ZIO_PRIORITY_AGG (zio_priority_table[5])
#define ZIO_PRIORITY_FREE (zio_priority_table[6])
#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7])
#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
#define ZIO_PRIORITY_TABLE_SIZE 12
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,

ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
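/*
 * A minimal sketch of what the ZIO_PRIORITY_NUM_QUEUEABLE sentinel is
 * for (illustrative only -- the real per-priority layout lives in the
 * reworked vdev_queue.c, whose large diff is not rendered below): it
 * sizes per-priority state, and the new io_queue_node links a zio into
 * the structure for its class.
 *
 *    avl_tree_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
 *
 *    ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 *    avl_add(&vq_class[zio->io_priority], zio);
 *
 * ZIO_PRIORITY_NOW sits after the sentinel, so a non-queued i/o that is
 * ever used as an index trips the assertion.
 */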

#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
@@ -198,7 +195,8 @@ enum zio_flag {
ZIO_FLAG_GODFATHER = 1 << 24,
ZIO_FLAG_NOPWRITE = 1 << 25,
ZIO_FLAG_REEXECUTED = 1 << 26,
ZIO_FLAG_FASTWRITE = 1 << 27
ZIO_FLAG_DELEGATED = 1 << 27,
ZIO_FLAG_FASTWRITE = 1 << 28
};

#define ZIO_FLAG_MUSTSUCCEED 0
@@ -238,8 +236,7 @@ enum zio_wait_type {

typedef void zio_done_func_t(zio_t *zio);

extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
extern char *zio_type_name[ZIO_TYPES];
extern const char *zio_type_name[ZIO_TYPES];

/*
* A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
@@ -381,7 +378,7 @@ struct zio {
zio_type_t io_type;
enum zio_child io_child_type;
int io_cmd;
uint8_t io_priority;
zio_priority_t io_priority;
uint8_t io_reexecute;
uint8_t io_state[ZIO_WAIT_TYPES];
uint64_t io_txg;
@@ -396,7 +393,8 @@ struct zio {
zio_transform_t *io_transform_stack;

/* Callback info */
zio_done_func_t *io_ready;
zio_done_func_t *io_ready;
zio_done_func_t *io_physdone;
zio_done_func_t *io_done;
void *io_private;
int64_t io_prev_space_delta; /* DMU private */
@@ -414,13 +412,10 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops;

uint64_t io_offset;
uint64_t io_deadline; /* expires at timestamp + deadline */
hrtime_t io_timestamp; /* submitted at */
hrtime_t io_delta; /* vdev queue service delta */
uint64_t io_delay; /* vdev disk service delta (ticks) */
avl_node_t io_offset_node;
avl_node_t io_deadline_node;
avl_tree_t *io_vdev_tree;
avl_node_t io_queue_node;

/* Internal pipeline state */
enum zio_flag io_flags;
@@ -433,6 +428,7 @@ struct zio {
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
uint64_t io_child_count;
uint64_t io_phys_children;
uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
@@ -458,16 +454,17 @@ extern zio_t *zio_root(spa_t *spa,

extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
int priority, enum zio_flag flags, const zbookmark_t *zb);
zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb);

extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
int priority, enum zio_flag flags, const zbookmark_t *zb);
zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb);

extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
int priority, enum zio_flag flags, zbookmark_t *zb);
zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb);

extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
boolean_t nopwrite);
@@ -479,17 +476,17 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);

extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);

extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, enum zio_flag flags);
@@ -520,11 +517,12 @@ extern void zio_vdev_free(void *buf);
extern void zio_resubmit_stage_async(void *);

extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
uint64_t offset, void *data, uint64_t size, int type, int priority,
enum zio_flag flags, zio_done_func_t *done, void *private);
uint64_t offset, void *data, uint64_t size, int type,
zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private);

extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, int priority,
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private);

extern void zio_vdev_io_bypass(zio_t *zio);
597 changes: 473 additions & 124 deletions man/man5/zfs-module-parameters.5

Large diffs are not rendered by default.

166 changes: 110 additions & 56 deletions module/zfs/arc.c
@@ -134,6 +134,7 @@
#include <sys/arc.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -162,6 +163,12 @@ typedef enum arc_reclaim_strategy {
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/*
* The number of iterations through arc_evict_*() before we
* drop & reacquire the lock.
*/
int arc_evict_iterations = 100;

/* number of seconds before growing cache again */
int zfs_arc_grow_retry = 5;

@@ -183,6 +190,11 @@ int zfs_arc_memory_throttle_disable = 1;
/* disable duplicate buffer eviction */
int zfs_disable_dup_eviction = 0;

/*
* If this percent of memory is free, don't throttle.
*/
int arc_lotsfree_percent = 10;

static int arc_dead;

/* expiration time for arc_no_grow */
@@ -519,6 +531,7 @@ typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
@@ -1253,7 +1266,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
uint64_t from_delta, to_delta;

ASSERT(MUTEX_HELD(hash_lock));
ASSERT(new_state != old_state);
ASSERT3P(new_state, !=, old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
@@ -1859,6 +1872,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
arc_buf_hdr_t marker = {{{ 0 }}};
int count = 0;

ASSERT(state == arc_mru || state == arc_mfu);

@@ -1882,6 +1897,33 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
if (recycle && ab->b_size != bytes &&
ab_prev && ab_prev->b_size == bytes)
continue;

/* ignore markers */
if (ab->b_spa == 0)
continue;

/*
* It may take a long time to evict all the bufs requested.
* To avoid blocking all arc activity, periodically drop
* the arcs_mtx and give other threads a chance to run
* before reacquiring the lock.
*
* If we are looking for a buffer to recycle, we are in
* the hot code path, so don't sleep.
*/
if (!recycle && count++ > arc_evict_iterations) {
list_insert_after(list, ab, &marker);
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&state->arcs_mtx);
kpreempt(KPREEMPT_SYNC);
mutex_enter(&state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
count = 0;
continue;
}

hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
@@ -1963,27 +2005,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
ARCSTAT_INCR(arcstat_mutex_miss, missed);

/*
* We have just evicted some data into the ghost state, make
* sure we also adjust the ghost state size if necessary.
* Note: we have just evicted some data into the ghost state,
* potentially putting the ghost size over the desired size. Rather
* than evicting from the ghost list in this hot code path, leave
* this chore to the arc_reclaim_thread().
*/
if (arc_no_grow &&
arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
arc_mru_ghost->arcs_size - arc_c;

if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
int64_t todelete =
MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
arc_evict_ghost(arc_mru_ghost, 0, todelete,
ARC_BUFC_DATA);
} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
arc_mru_ghost->arcs_size +
arc_mfu_ghost->arcs_size - arc_c);
arc_evict_ghost(arc_mfu_ghost, 0, todelete,
ARC_BUFC_DATA);
}
}

return (stolen);
}
@@ -2002,13 +2028,16 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
int count = 0;

ASSERT(GHOST_STATE(state));
bzero(&marker, sizeof(marker));
top:
mutex_enter(&state->arcs_mtx);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
if (ab->b_type > ARC_BUFC_NUMTYPES)
panic("invalid ab=%p", (void *)ab);
if (spa && ab->b_spa != spa)
continue;

@@ -2020,6 +2049,23 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
continue;

/*
* It may take a long time to evict all the bufs requested.
* To avoid blocking all arc activity, periodically drop
* the arcs_mtx and give other threads a chance to run
* before reacquiring the lock.
*/
if (count++ > arc_evict_iterations) {
list_insert_after(list, ab, &marker);
mutex_exit(&state->arcs_mtx);
kpreempt(KPREEMPT_SYNC);
mutex_enter(&state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
count = 0;
continue;
}
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
@@ -2055,8 +2101,9 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
mutex_enter(&state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
} else
} else {
bufs_skipped += 1;
}
}
mutex_exit(&state->arcs_mtx);

@@ -3050,7 +3097,7 @@ arc_read_done(zio_t *zio)
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
void *private, int priority, int zio_flags, uint32_t *arc_flags,
void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
@@ -3702,6 +3749,18 @@ arc_write_ready(zio_t *zio)
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}

/*
* The SPA calls this callback for each physical write that happens on behalf
* of a logical write. See the comment in dbuf_write_physdone() for details.
*/
static void
arc_write_physdone(zio_t *zio)
{
arc_write_callback_t *cb = zio->io_private;
if (cb->awcb_physdone != NULL)
cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
}

static void
arc_write_done(zio_t *zio)
{
@@ -3782,8 +3841,9 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
void *private, int priority, int zio_flags, const zbookmark_t *zb)
const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
arc_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
@@ -3800,39 +3860,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
hdr->b_flags |= ARC_L2COMPRESS;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
callback->awcb_ready = ready;
callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;

zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
arc_write_ready, arc_write_physdone, arc_write_done, callback,
priority, zio_flags, zb);

return (zio);
}

static int
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory;

if (zfs_arc_memory_throttle_disable)
return (0);

/* Easily reclaimable memory (free + inactive + arc-evictable) */
available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();

if (available_memory <= zfs_write_limit_max) {
if (freemem <= physmem * arc_lotsfree_percent / 100) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
return (SET_ERROR(EAGAIN));
}

if (inflight_data > available_memory / 4) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
return (ERESTART);
}
#endif
return (0);
}
@@ -3850,15 +3901,6 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
int error;
uint64_t anon_size;

#ifdef ZFS_DEBUG
/*
* Once in a while, fail for no reason. Everything should cope.
*/
if (spa_get_random(10000) == 0) {
dprintf("forcing random failure\n");
return (ERESTART);
}
#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
if (reserve > arc_c) {
@@ -3878,7 +3920,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
if ((error = arc_memory_throttle(reserve, anon_size, txg)))
error = arc_memory_throttle(reserve, txg);
if (error != 0)
return (error);

/*
@@ -4075,11 +4118,24 @@ arc_init(void)
arc_dead = FALSE;
arc_warm = B_FALSE;

if (zfs_write_limit_max == 0)
zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
else
zfs_write_limit_shift = 0;
mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
/*
* Calculate maximum amount of dirty data per pool.
*
* If it has been set by a module parameter, take that.
* Otherwise, use a percentage of physical memory defined by
* zfs_dirty_data_max_percent (default 10%) with a cap at
* zfs_dirty_data_max_max (default 25% of physical memory).
*/
if (zfs_dirty_data_max_max == 0)
zfs_dirty_data_max_max = physmem * PAGESIZE *
zfs_dirty_data_max_max_percent / 100;

if (zfs_dirty_data_max == 0) {
zfs_dirty_data_max = physmem * PAGESIZE *
zfs_dirty_data_max_percent / 100;
zfs_dirty_data_max = MIN(zfs_dirty_data_max,
zfs_dirty_data_max_max);
}
}
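/*
 * Worked example, using the defaults described above: with 16 GiB of
 * physical memory, zfs_dirty_data_max defaults to 16 GiB * 10% = 1.6 GiB
 * and zfs_dirty_data_max_max to 16 GiB * 25% = 4 GiB, so the MIN() does
 * not bind; the cap only matters when zfs_dirty_data_max_percent is
 * tuned above zfs_dirty_data_max_max_percent.
 */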

void
@@ -4137,8 +4193,6 @@ arc_fini(void)
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
mutex_destroy(&arc_l2c_only->arcs_mtx);

mutex_destroy(&zfs_write_limit_lock);

buf_fini();

ASSERT(arc_loaned_bytes == 0);
68 changes: 54 additions & 14 deletions module/zfs/dbuf.c
@@ -891,7 +891,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
atomic_inc_64(&zfs_free_range_recv_miss);
}

for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);

Expand Down Expand Up @@ -1238,6 +1238,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
dr->dr_accounted = db->db.db_size;
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
dr->dr_next = *drp;
@@ -1321,7 +1323,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dbuf_rele(parent, FTAG);

mutex_enter(&db->db_mtx);
/* possible race with dbuf_undirty() */
/*
* Since we've dropped the mutex, it's possible that
* dbuf_undirty() might have changed this out from under us.
*/
if (db->db_last_dirty == dr ||
dn->dn_object == DMU_META_DNODE_OBJECT) {
mutex_enter(&di->dt.di.dr_mtx);
@@ -1391,7 +1396,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)

ASSERT(db->db.db_size != 0);

/* XXX would be nice to fix up dn_towrite_space[] */
/*
* Any space we accounted for in dp_dirty_* will be cleaned up by
* dsl_pool_sync(). This is relatively rare so the discrepancy
* is not a big deal.
*/

*drp = dr->dr_next;

@@ -1571,7 +1580,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)

/*
* "Clear" the contents of this dbuf. This will mark the dbuf
* EVICTING and clear *most* of its references. Unfortunetely,
* EVICTING and clear *most* of its references. Unfortunately,
* when we are not holding the dn_dbufs_mtx, we can't clear the
* entry in the dn_dbufs list. We have to wait until dbuf_destroy()
* in this case. For callers from the DMU we will usually see:
@@ -1768,7 +1777,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_offset = 0;
} else {
int blocksize =
db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
db->db.db_size = blocksize;
db->db.db_offset = db->db_blkid * blocksize;
}
@@ -1877,7 +1886,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
dmu_buf_impl_t *db = NULL;
blkptr_t *bp = NULL;
@@ -1901,8 +1910,6 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)

if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
Expand All @@ -1911,7 +1918,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
dn->dn_object, 0, blkid);

(void) arc_read(NULL, dn->dn_objset->os_spa,
bp, NULL, NULL, priority,
bp, NULL, NULL, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
}
@@ -2647,6 +2654,38 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_exit(&db->db_mtx);
}

/*
* The SPA will call this callback several times for each zio - once
* for every physical child i/o (zio->io_phys_children times). This
* allows the DMU to monitor the progress of each logical i/o. For example,
* there may be 2 copies of an indirect block, or many fragments of a RAID-Z
* block. There may be a long delay before all copies/fragments are completed,
* so this callback allows us to retire dirty space gradually, as the physical
* i/os complete.
*/
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
dmu_buf_impl_t *db = arg;
objset_t *os = db->db_objset;
dsl_pool_t *dp = dmu_objset_pool(os);
dbuf_dirty_record_t *dr;
int delta = 0;

dr = db->db_data_pending;
ASSERT3U(dr->dr_txg, ==, zio->io_txg);

/*
* The callback will be called io_phys_children times. Retire one
* portion of our dirty space each time we are called. Any rounding
* error will be cleaned up by dsl_pool_sync()'s call to
* dsl_pool_undirty_space().
*/
delta = dr->dr_accounted / zio->io_phys_children;
dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
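/*
 * Worked example (sizes assumed for illustration): a 128K logical write
 * whose block pointer ends up with two DVAs has io_phys_children == 2,
 * so each invocation of this callback retires 131072 / 2 == 65536 bytes
 * of dirty space.  With three copies the division truncates (131072 / 3
 * == 43690), and the 2 leftover bytes are the rounding error cleaned up
 * by dsl_pool_undirty_space() at sync time.
 */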

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
@@ -2741,6 +2780,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;

dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}

@@ -2859,8 +2899,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ASSERT(db->db_state != DB_NOFILL);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
dbuf_write_override_ready, dbuf_write_override_done, dr,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
dbuf_write_override_ready, NULL, dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
@@ -2870,16 +2910,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, NULL, db->db.db_size, &zp,
dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
ASSERT(arc_released(data));
dr->dr_zio = arc_write(zio, os->os_spa, txg,
db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED, &zb);
dbuf_write_physdone, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}

37 changes: 21 additions & 16 deletions module/zfs/dmu.c
@@ -370,13 +370,11 @@ static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
dsl_pool_t *dp = NULL;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
int err;
zio_t *zio;
hrtime_t start = 0;

ASSERT(length <= DMU_MAX_ACCESS);

@@ -404,9 +402,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG);

if (dn->dn_objset->os_dsl_dataset)
dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
start = gethrtime();
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++) {
@@ -427,9 +422,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,

/* wait for async i/o */
err = zio_wait(zio);
/* track read overhead when we are in sync context */
if (dp && dsl_pool_sync_context(dp))
dp->dp_read_overhead += gethrtime() - start;
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
@@ -511,12 +503,22 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

/*
* Issue prefetch i/os for the given blocks.
*
* Note: The assumption is that we *know* these blocks will be needed
* almost immediately. Therefore, the prefetch i/os will be issued at
* ZIO_PRIORITY_SYNC_READ.
*
* Note: indirect blocks and other metadata will be read synchronously,
* causing this function to block if they are not already cached.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
dnode_t *dn;
uint64_t blkid;
int nblks, i, err;
int nblks, err;

if (zfs_prefetch_disable)
return;
@@ -529,7 +531,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)

rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
dbuf_prefetch(dn, blkid);
dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
rw_exit(&dn->dn_struct_rwlock);
return;
}
@@ -546,16 +548,18 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
P2ALIGN(offset, 1<<blkshift)) >> blkshift;
nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
P2ALIGN(offset, 1 << blkshift)) >> blkshift;
} else {
nblks = (offset < dn->dn_datablksz);
}

if (nblks != 0) {
int i;

blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++)
dbuf_prefetch(dn, blkid+i);
dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
}

rw_exit(&dn->dn_struct_rwlock);
@@ -1559,7 +1563,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,

zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb));

return (0);
@@ -1699,8 +1703,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)

zio_nowait(arc_write(pio, os->os_spa, txg,
bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb));
DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL, &zb));

return (0);
}
2 changes: 1 addition & 1 deletion module/zfs/dmu_objset.c
@@ -1032,7 +1032,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
zio = arc_write(pio, os->os_spa, tx->tx_txg,
os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED, &zb);

/*
207 changes: 197 additions & 10 deletions module/zfs/dmu_tx.c
@@ -53,7 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = {
{ "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 },
{ "dmu_tx_memory_inflight", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
{ "dmu_tx_write_limit", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
};

@@ -70,6 +71,7 @@ dmu_tx_create_dd(dsl_dir_t *dd)
offsetof(dmu_tx_hold_t, txh_node));
list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
tx->tx_start = gethrtime();
#ifdef DEBUG_DMU_TX
refcount_create(&tx->tx_space_written);
refcount_create(&tx->tx_space_freed);
@@ -614,6 +616,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
if (txh == NULL)
return;
dn = txh->txh_dnode;
dmu_tx_count_dnode(txh);

if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
@@ -931,6 +934,142 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
}
#endif

/*
* If we can't do 10 iops, something is wrong.  Capping the delay at
* 100ms per transaction (zfs_delay_max_ns below) guarantees at least
* 10 transactions per second; past that point we would rather hit
* zfs_dirty_data_max.
*/
hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */

/*
* We delay transactions when we've determined that the backend storage
* isn't able to accommodate the rate of incoming writes.
*
* If there is already a transaction waiting, we delay relative to when
* that transaction finishes waiting. This way the calculated min_time
* is independent of the number of threads concurrently executing
* transactions.
*
* If we are the only waiter, wait relative to when the transaction
* started, rather than the current time. This credits the transaction for
* "time already served", e.g. reading indirect blocks.
*
* The minimum time for a transaction to take is calculated as:
* min_time = scale * (dirty - min) / (max - dirty)
* min_time is then capped at zfs_delay_max_ns.
*
* The delay has two degrees of freedom that can be adjusted via tunables.
* The percentage of dirty data at which we start to delay is defined by
* zfs_delay_min_dirty_percent. This should typically be at or above
* zfs_vdev_async_write_active_max_dirty_percent so that we only start to
* delay after writing at full speed has failed to keep up with the incoming
* write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
* speaking, this variable determines the amount of delay at the midpoint of
* the curve.
*
* delay
* 10ms +-------------------------------------------------------------*+
* | *|
* 9ms + *+
* | *|
* 8ms + *+
* | * |
* 7ms + * +
* | * |
* 6ms + * +
* | * |
* 5ms + * +
* | * |
* 4ms + * +
* | * |
* 3ms + * +
* | * |
* 2ms + (midpoint) * +
* | | ** |
* 1ms + v *** +
* | zfs_delay_scale ----------> ******** |
* 0 +-------------------------------------*********----------------+
* 0% <- zfs_dirty_data_max -> 100%
*
* Note that since the delay is added to the outstanding time remaining on the
* most recent transaction, the delay is effectively the inverse of IOPS.
* Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
* was chosen such that small changes in the amount of accumulated dirty data
* in the first 3/4 of the curve yield relatively small differences in the
* amount of delay.
*
* The effects can be easier to understand when the amount of delay is
* represented on a log scale:
*
* delay
* 100ms +-------------------------------------------------------------++
* + +
* | |
* + *+
* 10ms + *+
* + ** +
* | (midpoint) ** |
* + | ** +
* 1ms + v **** +
* + zfs_delay_scale ----------> ***** +
* | **** |
* + **** +
* 100us + ** +
* + * +
* | * |
* + * +
* 10us + * +
* + +
* | |
* + +
* +--------------------------------------------------------------+
* 0% <- zfs_dirty_data_max -> 100%
*
* Note here that only as the amount of dirty data approaches its limit does
* the delay start to increase rapidly. The goal of a properly tuned system
* should be to keep the amount of dirty data out of that range by first
* ensuring that the appropriate limits are set for the I/O scheduler to reach
* optimal throughput on the backend storage, and then by changing the value
* of zfs_delay_scale to increase the steepness of the curve.
*/
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
dsl_pool_t *dp = tx->tx_pool;
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
hrtime_t wakeup, min_tx_time, now;

if (dirty <= delay_min_bytes)
return;

/*
* The caller has already waited until we are under the max.
* We make them pass us the amount of dirty data so we don't
* have to handle the case of it being >= the max, which could
* cause a divide-by-zero if it's == the max.
*/
ASSERT3U(dirty, <, zfs_dirty_data_max);

now = gethrtime();
min_tx_time = zfs_delay_scale *
(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
if (now > tx->tx_start + min_tx_time)
return;

DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
uint64_t, min_tx_time);

mutex_enter(&dp->dp_lock);
wakeup = MAX(tx->tx_start + min_tx_time,
dp->dp_last_wakeup + min_tx_time);
dp->dp_last_wakeup = wakeup;
mutex_exit(&dp->dp_lock);

zfs_sleep_until(wakeup);
}
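/*
 * A self-contained user-space sketch of the min_tx_time curve computed
 * above; the tunable values are assumptions for illustration (a 4GB
 * zfs_dirty_data_max, rounded to 4e9 for exact percentages, delay
 * starting at 60% dirty, and a 500000ns zfs_delay_scale, i.e. a 2000
 * IOPS midpoint):
 *
 *    #include <stdio.h>
 *    #include <stdint.h>
 *
 *    int
 *    main(void)
 *    {
 *        const uint64_t dirty_max = 4000000000ULL;    /* 4GB */
 *        const uint64_t delay_min = dirty_max * 60 / 100;
 *        const uint64_t scale = 500000;               /* ns */
 *        int pct;
 *
 *        for (pct = 62; pct <= 98; pct += 2) {
 *            uint64_t dirty = dirty_max * pct / 100;
 *            printf("%3d%% dirty -> %8llu ns\n", pct,
 *                (unsigned long long)(scale *
 *                (dirty - delay_min) / (dirty_max - dirty)));
 *        }
 *        return (0);
 *    }
 *
 * At 80% dirty this prints 500000ns (the midpoint of the curves drawn
 * above); by 98% it is 9500000ns, approaching the zfs_delay_max_ns cap.
 */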

static int
dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
@@ -965,6 +1104,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
return (SET_ERROR(ERESTART));
}

if (!tx->tx_waited &&
dsl_pool_need_dirty_delay(tx->tx_pool)) {
tx->tx_wait_dirty = B_TRUE;
DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
return (ERESTART);
}

tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
tx->tx_needassign_txh = NULL;

@@ -1092,6 +1238,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
* blocking, returns immediately with ERESTART. This should be used
* whenever you're holding locks. On an ERESTART error, the caller
* should drop locks, do a dmu_tx_wait(tx), and try again.
*
* (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
* has already been called on behalf of this operation (though
* most likely on a different tx).
*/
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
@@ -1100,11 +1250,15 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
int err;

ASSERT(tx->tx_txg == 0);
ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
txg_how == TXG_WAITED);
ASSERT(!dsl_pool_sync_context(tx->tx_pool));

before = gethrtime();

if (txg_how == TXG_WAITED)
tx->tx_waited = B_TRUE;

/* If we might wait, we must not hold the config lock. */
ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));

@@ -1128,17 +1282,47 @@ void
dmu_tx_wait(dmu_tx_t *tx)
{
spa_t *spa = tx->tx_pool->dp_spa;
dsl_pool_t *dp = tx->tx_pool;

ASSERT(tx->tx_txg == 0);
ASSERT(!dsl_pool_config_held(tx->tx_pool));

/*
* It's possible that the pool has become active after this thread
* has tried to obtain a tx. If that's the case then its
* tx_lasttried_txg would not have been assigned.
*/
if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
if (tx->tx_wait_dirty) {
uint64_t dirty;

/*
* dmu_tx_try_assign() has determined that we need to wait
* because we've consumed much or all of the dirty buffer
* space.
*/
mutex_enter(&dp->dp_lock);
if (dp->dp_dirty_total >= zfs_dirty_data_max)
DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
while (dp->dp_dirty_total >= zfs_dirty_data_max)
cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
dirty = dp->dp_dirty_total;
mutex_exit(&dp->dp_lock);

dmu_tx_delay(tx, dirty);

tx->tx_wait_dirty = B_FALSE;

/*
* Note: setting tx_waited only has effect if the caller
* used TXG_WAIT. Otherwise they are going to destroy
* this tx and try again. The common case, zfs_write(),
* uses TXG_WAIT.
*/
tx->tx_waited = B_TRUE;
} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
/*
* If the pool is suspended we need to wait until it
* is resumed. Note that it's possible that the pool
* has become active after this thread has tried to
* obtain a tx. If that's the case then tx_lasttried_txg
* would not have been set.
*/
txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
} else if (tx->tx_needassign_txh) {
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

@@ -1148,6 +1332,10 @@ dmu_tx_wait(dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
tx->tx_needassign_txh = NULL;
} else {
/*
* A dnode is assigned to the quiescing txg. Wait for its
* transaction to complete.
*/
txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
}
}
@@ -1268,7 +1456,6 @@ dmu_tx_pool(dmu_tx_t *tx)
return (tx->tx_pool);
}


void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
6 changes: 5 additions & 1 deletion module/zfs/dmu_zfetch.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/

/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
@@ -287,7 +291,7 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);

for (i = 0; i < fetchsz; i++) {
dbuf_prefetch(dn, blkid + i);
dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
}

return (fetchsz);
17 changes: 8 additions & 9 deletions module/zfs/dnode.c
@@ -1789,23 +1789,22 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
}

/*
* Call when we think we're going to write/free space in open context.
* Be conservative (ie. OK to write less than this or free more than
* this, but don't write more or free less).
* Call when we think we're going to write/free space in open context to track
* the amount of memory in use by the currently open txg.
*/
void
dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
int64_t aspace = spa_get_asize(os->os_spa, space);

if (space > 0)
space = spa_get_asize(os->os_spa, space);

if (ds)
dsl_dir_willuse_space(ds->ds_dir, space, tx);
if (ds != NULL) {
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}

dmu_tx_willuse_space(tx, space);
dmu_tx_willuse_space(tx, aspace);
}

/*
50 changes: 20 additions & 30 deletions module/zfs/dsl_dir.c
@@ -589,7 +589,6 @@ dsl_dir_space_available(dsl_dir_t *dd,

struct tempreserve {
list_node_t tr_node;
dsl_pool_t *tr_dp;
dsl_dir_t *tr_ds;
uint64_t tr_size;
};
@@ -740,25 +739,24 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE);
tr->tr_size = lsize;
list_insert_tail(tr_list, tr);

err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
} else {
if (err == EAGAIN) {
/*
* If arc_memory_throttle() detected that pageout
* is running and we are low on memory, we delay new
* non-pageout transactions to give pageout an
* advantage.
*
* It is unfortunate to be delaying while the caller's
* locks are held.
*/
txg_delay(dd->dd_pool, tx->tx_txg,
MSEC2NSEC(10), MSEC2NSEC(10));
err = SET_ERROR(ERESTART);
}
dsl_pool_memory_pressure(dd->dd_pool);
}

if (err == 0) {
struct tempreserve *tr;

tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE);
tr->tr_dp = dd->dd_pool;
tr->tr_size = asize;
list_insert_tail(tr_list, tr);

err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
FALSE, asize > usize, tr_list, tx, TRUE);
}
@@ -787,10 +785,8 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
if (tr_cookie == NULL)
return;

while ((tr = list_head(tr_list))) {
if (tr->tr_dp) {
dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
} else if (tr->tr_ds) {
while ((tr = list_head(tr_list)) != NULL) {
if (tr->tr_ds) {
mutex_enter(&tr->tr_ds->dd_lock);
ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
tr->tr_size);
@@ -806,8 +802,14 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
kmem_free(tr_list, sizeof (list_t));
}

static void
dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
/*
* This should be called from open context when we think we're going to write
* or free space, for example when dirtying data. Be conservative; it's okay
* to write less space or free more, but we don't want to write more or free
* less than the amount specified.
*/
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
int64_t parent_space;
uint64_t est_used;
@@ -825,19 +827,7 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)

/* XXX this is potentially expensive and unnecessary... */
if (parent_space && dd->dd_parent)
dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
}

/*
* Call in open context when we think we're going to write/free space,
* eg. when dirtying data. Be conservative (ie. OK to write less than
* this or free more than this, but don't write more or free less).
*/
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
dsl_pool_willuse_space(dd->dd_pool, space, tx);
dsl_dir_willuse_space_impl(dd, space, tx);
dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
}

/* call from syncing context when we actually write/free space for this dd */
360 changes: 183 additions & 177 deletions module/zfs/dsl_pool.c

Large diffs are not rendered by default.

5 changes: 1 addition & 4 deletions module/zfs/dsl_scan.c
@@ -1650,7 +1650,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
boolean_t needs_io = B_FALSE;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
int zio_priority = 0;
int scan_delay = 0;
int d;

Expand All @@ -1663,13 +1662,11 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;
zio_priority = ZIO_PRIORITY_SCRUB;
needs_io = B_TRUE;
scan_delay = zfs_scrub_delay;
} else {
ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
zio_flags |= ZIO_FLAG_RESILVER;
zio_priority = ZIO_PRIORITY_RESILVER;
needs_io = B_FALSE;
scan_delay = zfs_resilver_delay;
}
@@ -1727,7 +1724,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
delay(scan_delay);

zio_nowait(zio_read(NULL, spa, bp, data, size,
dsl_scan_scrub_done, NULL, zio_priority,
dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
zio_flags, zb));
}

92 changes: 57 additions & 35 deletions module/zfs/spa.c
@@ -83,7 +83,6 @@

typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */
ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES
@@ -142,7 +141,7 @@ static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
id_t zio_taskq_psrset_bind = PS_NONE;
boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
uint_t zio_taskq_basedc = 80; /* base duty cycle */
@@ -837,31 +836,27 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tqs->stqs_count = count;
tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

for (i = 0; i < count; i++) {
taskq_t *tq;

switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;

case ZTI_MODE_BATCH:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;
case ZTI_MODE_BATCH:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;

case ZTI_MODE_ONLINE_PERCENT:
flags |= TASKQ_THREADS_CPU_PCT;
break;
default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
"spa_activate()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}

default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
"spa_activate()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}
for (i = 0; i < count; i++) {
taskq_t *tq;

if (count > 1) {
(void) snprintf(name, sizeof (name), "%s_%s_%u",
@@ -878,7 +873,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
tq = taskq_create_proc(name, value, maxclsyspri, 50,
pri_t pri = maxclsyspri;
/*
* The write issue taskq can be extremely CPU
* intensive. Run it at slightly lower priority
* than the other taskqs.
*/
if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
pri--;

tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
}

@@ -5775,6 +5779,31 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
return (0);
}

/*
* Note: this simple function is not inlined to make it easier to dtrace the
* amount of time spent syncing frees.
*/
static void
spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
{
zio_t *zio = zio_root(spa, NULL, NULL, 0);
bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
VERIFY(zio_wait(zio) == 0);
}

/*
* Note: this simple function is not inlined to make it easier to dtrace the
* amount of time spent syncing deferred frees.
*/
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
spa_free_sync_cb, zio, tx), ==, 0);
VERIFY0(zio_wait(zio));
}
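/*
 * An example of the measurement these wrappers enable (assuming the
 * illumos fbt provider; a kprobe/ftrace probe on the same symbol is the
 * Linux equivalent):
 *
 *    dtrace -n 'fbt::spa_sync_frees:entry { self->ts = timestamp; }
 *        fbt::spa_sync_frees:return /self->ts/ {
 *            @["sync frees (ns)"] = quantize(timestamp - self->ts);
 *            self->ts = 0; }'
 */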

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
@@ -6102,7 +6131,6 @@ spa_sync(spa_t *spa, uint64_t txg)
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
@@ -6185,10 +6213,7 @@ spa_sync(spa_t *spa, uint64_t txg)
!txg_list_empty(&dp->dp_sync_tasks, txg) ||
((dsl_scan_active(dp->dp_scan) ||
txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(defer_bpo,
spa_free_sync_cb, zio, tx), ==, 0);
VERIFY0(zio_wait(zio));
spa_sync_deferred_frees(spa, tx);
}

/*
@@ -6206,13 +6231,10 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync(dp, txg);

if (pass < zfs_sync_pass_deferred_free) {
zio_t *zio = zio_root(spa, NULL, NULL, 0);
bplist_iterate(free_bpl, spa_free_sync_cb,
zio, tx);
VERIFY(zio_wait(zio) == 0);
spa_sync_frees(spa, free_bpl, tx);
} else {
bplist_iterate(free_bpl, bpobj_enqueue_cb,
defer_bpo, tx);
&spa->spa_deferred_bpobj, tx);
}

ddt_sync(spa, txg);
46 changes: 26 additions & 20 deletions module/zfs/spa_misc.c
@@ -238,21 +238,31 @@ kmem_cache_t *spa_buffer_pool;
int spa_mode_global;

/*
* Expiration time in units of zfs_txg_synctime_ms. This value has two
* meanings. First it is used to determine when the spa_deadman logic
* should fire. By default the spa_deadman will fire if spa_sync has
* not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
* Secondly, the value determines if an I/O is considered "hung".
* Any I/O that has not completed in zfs_deadman_synctime is considered
* "hung" resulting in a zevent being posted.
* Expiration time in milliseconds. This value has two meanings. First it is
* used to determine when the spa_deadman() logic should fire. By default the
* spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
* Secondly, the value determines if an I/O is considered "hung". Any I/O that
* has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
* in a system panic.
*/
unsigned long zfs_deadman_synctime = 1000ULL;
unsigned long zfs_deadman_synctime_ms = 1000000ULL;

/*
* By default the deadman is enabled.
*/
int zfs_deadman_enabled = 1;

/*
* The worst case is single-sector max-parity RAID-Z blocks, in which
* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
* times the size; so just assume that. Add to this the fact that
* we can have up to 3 DVAs per bp, and one more factor of 2 because
* the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
* the worst case is:
* (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
*/
int spa_asize_inflation = 24;

/*
* ==========================================================================
* SPA config locking
@@ -489,8 +499,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_proc = &p0;
spa->spa_proc_state = SPA_PROC_NONE;

spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
zfs_txg_synctime_ms);
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);

refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
@@ -1452,14 +1461,7 @@ spa_freeze_txg(spa_t *spa)
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
{
/*
* The worst case is single-sector max-parity RAID-Z blocks, in which
* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
* times the size; so just assume that. Add to this the fact that
* we can have up to 3 DVAs per bp, and one more factor of 2 because
* the block may be dittoed with up to 3 DVAs by ddt_sync().
*/
return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
return (lsize * spa_asize_inflation);
}
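/*
 * Example: with the default factor of 24, the worst-case estimate for a
 * 4K logical write is spa_get_asize(spa, 4096) == 4096 * 24 == 98304
 * bytes (96K).  The factor is adjustable at runtime through the
 * spa_asize_inflation module parameter declared below.
 */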

uint64_t
@@ -1880,9 +1882,13 @@ EXPORT_SYMBOL(spa_mode);

EXPORT_SYMBOL(spa_namespace_lock);

module_param(zfs_deadman_synctime, ulong, 0644);
MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms");
module_param(zfs_deadman_synctime_ms, ulong, 0644);
MODULE_PARM_DESC(zfs_deadman_synctime_ms,"Expiration time in milliseconds");

module_param(zfs_deadman_enabled, int, 0644);
MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");

module_param(spa_asize_inflation, int, 0644);
MODULE_PARM_DESC(spa_asize_inflation,
"SPA size estimate multiplication factor");
#endif
33 changes: 28 additions & 5 deletions module/zfs/txg.c
@@ -46,15 +46,15 @@
* either be processing, or blocked waiting to enter the next state. There may
* be up to three active txgs, and there is always a txg in the open state
* (though it may be blocked waiting to enter the quiescing state). In broad
* strokes, transactions — operations that change in-memory structures — are
* strokes, transactions -- operations that change in-memory structures -- are
* accepted into the txg in the open state, and are completed while the txg is
* in the open or quiescing states. The accumulated changes are written to
* disk in the syncing state.
*
* Open
*
* When a new txg becomes active, it first enters the open state. New
* transactions — updates to in-memory structures — are assigned to the
* transactions -- updates to in-memory structures -- are assigned to the
* currently open txg. There is always a txg in the open state so that ZFS can
* accept new changes (though the txg may refuse new changes if it has hit
* some limit). ZFS advances the open txg to the next state for a variety of
@@ -375,6 +375,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)

ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
tx->tx_open_time = gethrtime();

spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
@@ -511,7 +512,8 @@ txg_sync_thread(dsl_pool_t *dp)
while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
tx->tx_quiesced_txg == 0) {
tx->tx_quiesced_txg == 0 &&
dp->dp_dirty_total < zfs_dirty_data_sync) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
@@ -574,8 +576,7 @@ txg_sync_thread(dsl_pool_t *dp)
vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
dp->dp_space_towrite[txg & TXG_MASK] +
dp->dp_tempreserved[txg & TXG_MASK] / 2);
dp->dp_dirty_pertxg[txg & TXG_MASK]);
spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
}
}
@@ -705,6 +706,28 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg)
mutex_exit(&tx->tx_sync_lock);
}

/*
* If there isn't a txg syncing or in the pipeline, push another txg through
* the pipeline by quiescing the open txg.
*/
void
txg_kick(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;

ASSERT(!dsl_pool_config_held(dp));

mutex_enter(&tx->tx_sync_lock);
if (tx->tx_syncing_txg == 0 &&
tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
tx->tx_quiesced_txg <= tx->tx_synced_txg) {
tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
cv_broadcast(&tx->tx_quiesce_more_cv);
}
mutex_exit(&tx->tx_sync_lock);
}
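/*
 * The expected caller is the dirty-data accounting in dsl_pool.c (whose
 * large diff is not rendered here); in this change it is believed to be
 * dsl_pool_need_dirty_delay(), kicking the pipeline once dp_dirty_total
 * exceeds zfs_dirty_data_sync.  This pairs with the txg_sync_thread()
 * wait condition above, which stops sleeping for the same reason.
 */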

boolean_t
txg_stalled(dsl_pool_t *dp)
{
4 changes: 2 additions & 2 deletions module/zfs/vdev.c
@@ -3296,7 +3296,7 @@ vdev_deadman(vdev_t *vd)
vdev_queue_t *vq = &vd->vdev_queue;

mutex_enter(&vq->vq_lock);
if (avl_numnodes(&vq->vq_pending_tree) > 0) {
if (avl_numnodes(&vq->vq_active_tree) > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
@@ -3306,7 +3306,7 @@ vdev_deadman(vdev_t *vd)
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime we log a zevent.
*/
fio = avl_first(&vq->vq_pending_tree);
fio = avl_first(&vq->vq_active_tree);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa)) {
zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
2 changes: 1 addition & 1 deletion module/zfs/vdev_cache.c
@@ -312,7 +312,7 @@ vdev_cache_read(zio_t *zio)
}

fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

ve->ve_fill_io = fio;
4 changes: 2 additions & 2 deletions module/zfs/vdev_mirror.c
@@ -89,7 +89,7 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
static int
vdev_mirror_pending(vdev_t *vd)
{
return (avl_numnodes(&vd->vdev_queue.vq_pending_tree));
return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
}

/*
@@ -499,7 +499,7 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio->io_data, zio->io_size,
ZIO_TYPE_WRITE, zio->io_priority,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
734 changes: 522 additions & 212 deletions module/zfs/vdev_queue.c

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion module/zfs/vdev_raidz.c
@@ -2188,7 +2188,7 @@ vdev_raidz_io_done(zio_t *zio)

zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size,
ZIO_TYPE_WRITE, zio->io_priority,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
2 changes: 0 additions & 2 deletions module/zfs/zfs_fm.c
@@ -316,8 +316,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, zio->io_delay, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
DATA_TYPE_UINT64, zio->io_timestamp, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE,
DATA_TYPE_UINT64, zio->io_deadline, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
DATA_TYPE_UINT64, zio->io_delta, NULL);

37 changes: 28 additions & 9 deletions module/zfs/zfs_vnops.c
@@ -125,7 +125,11 @@
* forever, because the previous txg can't quiesce until B's tx commits.
*
* If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT,
* then drop all locks, call dmu_tx_wait(), and try again.
* then drop all locks, call dmu_tx_wait(), and try again. On subsequent
* calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
* to indicate that this operation has already called dmu_tx_wait().
* This will ensure that we don't retry forever, waiting a short bit
* each time.
*
* (5) If the operation succeeded, generate the intent log entry for it
* before dropping locks. This ensures that the ordering of events
@@ -147,12 +151,13 @@
* rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify
* error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign
* error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
* if (error) {
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* iput(...); // release held vnodes
* if (error == ERESTART) {
* waited = B_TRUE;
* dmu_tx_wait(tx);
* dmu_tx_abort(tx);
* goto top;
@@ -1279,6 +1284,7 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
boolean_t have_acl = B_FALSE;
boolean_t waited = B_FALSE;

/*
* If we have an ephemeral id, ACL, or XVATTR then
@@ -1391,10 +1397,11 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, acl_ids.z_aclp->z_acl_bytes);
}
error = dmu_tx_assign(tx, TXG_NOWAIT);
error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
@@ -1524,6 +1531,7 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr)
#endif /* HAVE_PN_UTILS */
int error;
int zflg = ZEXISTS;
boolean_t waited = B_FALSE;

ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(dzp);
@@ -1599,13 +1607,14 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr)
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);

error = dmu_tx_assign(tx, TXG_NOWAIT);
error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
iput(ip);
if (xzp)
iput(ZTOI(xzp));
if (error == ERESTART) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
@@ -1710,6 +1719,7 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
boolean_t waited = B_FALSE;

ASSERT(S_ISDIR(vap->va_mode));

@@ -1801,10 +1811,11 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);

error = dmu_tx_assign(tx, TXG_NOWAIT);
error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
@@ -1882,6 +1893,7 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
dmu_tx_t *tx;
int error;
int zflg = ZEXISTS;
boolean_t waited = B_FALSE;

ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(dzp);
@@ -1935,13 +1947,14 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
if (error) {
rw_exit(&zp->z_parent_lock);
rw_exit(&zp->z_name_lock);
zfs_dirent_unlock(dl);
iput(ip);
if (error == ERESTART) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
@@ -3169,6 +3182,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
int cmp, serr, terr;
int error = 0;
int zflg = 0;
boolean_t waited = B_FALSE;

ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(sdzp);
@@ -3383,7 +3397,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,

zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_NOWAIT);
error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
if (error) {
if (zl != NULL)
zfs_rename_unlock(&zl);
@@ -3397,6 +3411,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
if (tzp)
iput(ZTOI(tzp));
if (error == ERESTART) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
@@ -3504,6 +3519,7 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
uint64_t txtype = TX_SYMLINK;
boolean_t waited = B_FALSE;

ASSERT(S_ISLNK(vap->va_mode));

@@ -3568,10 +3584,11 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
}
if (fuid_dirtied)
zfs_fuid_txhold(zsb, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
@@ -3699,6 +3716,7 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr)
int zf = ZNEW;
uint64_t parent;
uid_t owner;
boolean_t waited = B_FALSE;

ASSERT(S_ISDIR(tdip->i_mode));

@@ -3782,10 +3800,11 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr)
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
zfs_sa_upgrade_txholds(tx, szp);
zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
2 changes: 1 addition & 1 deletion module/zfs/zil.c
@@ -913,7 +913,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
}
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_FASTWRITE, &zb);
}
83 changes: 39 additions & 44 deletions module/zfs/zio.c
@@ -37,32 +37,12 @@
#include <sys/arc.h>
#include <sys/ddt.h>

/*
* ==========================================================================
* I/O priority table
* ==========================================================================
*/
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
0, /* ZIO_PRIORITY_NOW */
0, /* ZIO_PRIORITY_SYNC_READ */
0, /* ZIO_PRIORITY_SYNC_WRITE */
0, /* ZIO_PRIORITY_LOG_WRITE */
1, /* ZIO_PRIORITY_CACHE_FILL */
1, /* ZIO_PRIORITY_AGG */
4, /* ZIO_PRIORITY_FREE */
4, /* ZIO_PRIORITY_ASYNC_WRITE */
6, /* ZIO_PRIORITY_ASYNC_READ */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
2, /* ZIO_PRIORITY_DDT_PREFETCH */
};

/*
* ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
const char *zio_type_name[ZIO_TYPES] = {
"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
};

@@ -549,7 +529,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
*errorp = zio_worst_error(*errorp, zio->io_error);
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
if (--*countp == 0 && pio->io_stall == countp) {

(*countp)--;

if (*countp == 0 && pio->io_stall == countp) {
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
__zio_execute(pio);
@@ -573,7 +556,7 @@
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
zio_type_t type, int priority, enum zio_flag flags,
zio_type_t type, zio_priority_t priority, enum zio_flag flags,
vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
enum zio_stage stage, enum zio_stage pipeline)
{
@@ -620,6 +603,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_spa = spa;
zio->io_txg = txg;
zio->io_ready = NULL;
zio->io_physdone = NULL;
zio->io_done = done;
zio->io_private = private;
zio->io_prev_space_delta = 0;
@@ -629,7 +613,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_vsd = NULL;
zio->io_vsd_ops = NULL;
zio->io_offset = offset;
zio->io_deadline = 0;
zio->io_timestamp = 0;
zio->io_delta = 0;
zio->io_delay = 0;
@@ -646,6 +629,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_transform_stack = NULL;
zio->io_error = 0;
zio->io_child_count = 0;
zio->io_phys_children = 0;
zio->io_parent_count = 0;
zio->io_stall = NULL;
zio->io_gang_leader = NULL;
@@ -706,7 +690,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
int priority, enum zio_flag flags, const zbookmark_t *zb)
zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;

@@ -722,8 +706,9 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
int priority, enum zio_flag flags, const zbookmark_t *zb)
zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;

@@ -742,15 +727,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

zio->io_ready = ready;
zio->io_physdone = physdone;
zio->io_prop = *zp;

return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private, int priority,
enum zio_flag flags, zbookmark_t *zb)
uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
{
zio_t *zio;

@@ -829,7 +815,6 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, stage);


return (zio);
}

@@ -864,14 +849,14 @@

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
zio_done_func_t *done, void *private, enum zio_flag flags)
{
zio_t *zio;
int c;

if (vd->vdev_children == 0) {
zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

zio->io_cmd = cmd;
Expand All @@ -880,7 +865,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,

for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
done, private, priority, flags));
done, private, flags));
}

return (zio);
Expand All @@ -889,7 +874,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
int priority, enum zio_flag flags, boolean_t labels)
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;

@@ -910,7 +895,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
int priority, enum zio_flag flags, boolean_t labels)
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;

@@ -945,8 +930,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, int priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
@@ -981,12 +966,16 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
zio->io_logical->io_phys_children++;

return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
int type, int priority, enum zio_flag flags,
int type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -995,7 +984,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,

zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
data, size, done, private, type, priority,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
vd, offset, NULL,
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

Expand All @@ -1006,7 +995,7 @@ void
zio_flush(zio_t *zio, vdev_t *vd)
{
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
NULL, NULL, ZIO_PRIORITY_NOW,
NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

@@ -1951,7 +1940,7 @@ zio_write_gang_block(zio_t *pio)

zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
zio_write_gang_member_ready, NULL, &gn->gn_child[g],
zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark));
}
@@ -2335,7 +2324,7 @@ zio_ddt_write(zio_t *zio)
}

dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
zio->io_orig_size, &czp, NULL,
zio->io_orig_size, &czp, NULL, NULL,
zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

@@ -2357,7 +2346,7 @@ zio_ddt_write(zio_t *zio)
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
zio->io_orig_size, zp, zio_ddt_child_write_ready,
zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

@@ -2780,6 +2769,13 @@ zio_vdev_io_assess(zio_t *zio)
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
zio->io_physdone != NULL) {
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
zio->io_physdone(zio->io_logical);
}

return (ZIO_PIPELINE_CONTINUE);
}
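
Together with the zio_vdev_child_io() hunk above, this completes the io_physdone plumbing: each leaf-level child created under a logical zio increments that zio's io_phys_children, and the assess stage then fires the callback once per completed physical I/O, passing the logical zio. A hedged sketch of a consumer; the callback and context names are hypothetical, and the atomicity a real kernel caller would need is elided:

    /* Hypothetical per-write bookkeeping, passed as 'private'. */
    typedef struct write_state {
            uint64_t ws_phys_done;  /* leaf I/Os completed so far */
    } write_state_t;

    /* Matches zio_done_func_t; invoked with the *logical* zio. */
    static void
    example_physdone(zio_t *zio)
    {
            write_state_t *ws = zio->io_private;

            ws->ws_phys_done++;
            /*
             * zio->io_phys_children counts every leaf-level child
             * issued under this logical zio, so the ratio
             * ws_phys_done / io_phys_children tracks physical
             * progress before the logical I/O itself completes.
             */
    }

A caller would register such a callback through the widened zio_write() signature shown earlier, between the ready and done callbacks; the ZIO_FLAG_DELEGATED assertion above appears to keep aggregated (delegated) I/Os from being counted twice.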

@@ -3346,7 +3342,6 @@ EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
EXPORT_SYMBOL(zio_priority_table);
EXPORT_SYMBOL(zio_type_name);

module_param(zio_bulk_flags, int, 0644);