84 changes: 9 additions & 75 deletions module/zfs/dbuf.c
@@ -1432,41 +1432,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;

/*
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*
* This logic ensures that only block births for
* filled blocks are considered.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_last_dirty && (db->db_blkptr == NULL ||
!BP_IS_HOLE(db->db_blkptr))) {
birth_txg = db->db_last_dirty->dr_txg;
} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
birth_txg = db->db_blkptr->blk_birth;
}

/*
* If this block doesn't exist or is in a snapshot, it can't be freed.
* Don't pass the bp to dsl_dataset_block_freeable() since we
* are holding the db_mtx lock and might deadlock if we are
* prefetching a dedup-ed block.
*/
if (birth_txg != 0)
return (ds == NULL ||
dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
return (B_FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
@@ -1516,7 +1481,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
}
mutex_exit(&db->db_mtx);

dnode_willuse_space(dn, size-osize, tx);
dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
DB_DNODE_EXIT(db);
}

@@ -1566,7 +1531,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
objset_t *os;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
boolean_t do_free_accounting = B_FALSE;
int txgoff = tx->tx_txg & TXG_MASK;

ASSERT(tx->tx_txg != 0);
@@ -1688,15 +1652,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* Update the accounting.
* Note: we delay "free accounting" until after we drop
* the db_mtx. This keeps us from grabbing other locks
* (and possibly deadlocking) in bp_get_dsize() while
* also holding the db_mtx.
*/
dnode_willuse_space(dn, db->db.db_size, tx);
do_free_accounting = dbuf_block_freeable(db);
dmu_objset_willuse_space(os, db->db.db_size, tx);
}

/*
@@ -1790,21 +1746,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drop_struct_lock = TRUE;
}

if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
bp_get_dsize(os->os_spa, bp) : db->db.db_size;
/*
* This is only a guess -- if the dbuf is dirty
* in a previous txg, we don't know how much
* space it will use on disk yet. We should
* really have the struct_rwlock to access
* db_blkptr, but since this is just a guess,
* it's OK if we get an odd answer.
*/
ddt_prefetch(os->os_spa, bp);
dnode_willuse_space(dn, -willfree, tx);
}
/*
* If we are overwriting a dedup BP, then unless it is snapshotted,
* when we get to syncing context we will need to decrement its
* refcount in the DDT. Prefetch the relevant DDT block so that
* syncing context won't have to wait for the i/o.
*/
ddt_prefetch(os->os_spa, db->db_blkptr);

if (db->db_level == 0) {
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
@@ -3092,19 +3040,6 @@ dmu_buf_user_evict_wait()
taskq_wait(dbu_evict_taskq);
}

boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
boolean_t res = B_FALSE;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

if (db->db_blkptr)
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
db->db_blkptr, db->db_blkptr->blk_birth);

return (res);
}

blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
@@ -3891,7 +3826,6 @@ EXPORT_SYMBOL(dbuf_sync_list);
EXPORT_SYMBOL(dmu_buf_set_user);
EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_freeable);
EXPORT_SYMBOL(dmu_buf_get_blkptr);

/* BEGIN CSTYLED */
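
An aside on the dbuf_dirty() change above: the dirty path now calls ddt_prefetch() unconditionally instead of only when the old free-accounting logic decided the block was freeable. This is safe because ddt_prefetch() returns immediately unless the bp exists and is marked dedup. A minimal self-contained sketch of that guard (the stand-in type, the tunable default, and the exact predicate are assumptions; see ddt.c for the real code):

#include <stdbool.h>
#include <stddef.h>

typedef struct blkptr { bool bp_dedup; } blkptr_t;	/* hypothetical stand-in */
static bool zfs_dedup_prefetch = true;			/* assumed tunable default */

static void
ddt_prefetch_sketch(const blkptr_t *bp)
{
	/* NULL bps and non-dedup bps (including holes) fall out immediately. */
	if (!zfs_dedup_prefetch || bp == NULL || !bp->bp_dedup)
		return;
	/* ...otherwise issue async reads for the block's DDT entry... */
}
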
17 changes: 17 additions & 0 deletions module/zfs/dmu_objset.c
@@ -2344,6 +2344,23 @@ dmu_fsname(const char *snapname, char *buf)
return (0);
}

/*
* Call when we think we're going to write/free space in open context to track
* the amount of dirty data in the open txg, which is also the amount
* of memory that cannot be evicted until this txg syncs.
*/
void
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);

if (ds != NULL) {
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool);
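
The new dmu_objset_willuse_space() above feeds two separate accounting streams: the inflated worst-case size is charged against dsl_dir quotas and reservations, while the uninflated logical size feeds the per-pool dirty-data throttle. A rough sketch of the split (the two globals are stand-ins for the state behind dsl_dir_willuse_space() and dsl_pool_dirty_space()):

#include <stdint.h>

static int64_t dd_space_towrite;	/* stand-in: per-dsl_dir accounting */
static int64_t dp_dirty_total;		/* stand-in: per-pool dirty-data total */

static void
willuse_space_sketch(int64_t space, int64_t inflation)
{
	/* spa_get_worst_case_asize(): lsize * spa_asize_inflation */
	int64_t aspace = space * inflation;

	dd_space_towrite += aspace;	/* counted against quotas/reservations */
	dp_dirty_total += space;	/* throttles incoming writes */
}
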
845 changes: 176 additions & 669 deletions module/zfs/dmu_tx.c

Large diffs are not rendered by default.

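The collapsed dmu_tx.c diff above is the heart of this change: the dmu_tx_hold_*() functions stop reading blocks to estimate how much space a transaction might free (the dmu_buf_freeable()/zap_count_write() machinery deleted elsewhere in this diff) and instead simply total the bytes they might write, with frees summarized by a single net-free flag. A hedged sketch of the resulting shape, inferred from the signature changes visible in the hunks below rather than from the hidden hunk itself:

#include <stdint.h>

typedef struct dmu_tx_sketch {
	uint64_t tx_towrite;	/* logical bytes this tx may dirty */
	int tx_netfree;		/* set if the tx frees more than it writes */
} dmu_tx_sketch_t;

static void
hold_write_sketch(dmu_tx_sketch_t *tx, uint64_t len)
{
	tx->tx_towrite += len;	/* no attempt to estimate overwrites */
}

static void
mark_netfree_sketch(dmu_tx_sketch_t *tx)
{
	/* analogue of dmu_tx_mark_netfree(): relaxes the quota check */
	tx->tx_netfree = 1;
}

At assign time the accumulated total is inflated with spa_get_worst_case_asize() and handed to dsl_dir_tempreserve_space() together with the netfree flag, matching the new signature in the dsl_dir.c hunk.
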
19 changes: 0 additions & 19 deletions module/zfs/dnode.c
@@ -1948,25 +1948,6 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
mutex_exit(&dn->dn_mtx);
}

/*
* Call when we think we're going to write/free space in open context to track
* the amount of memory in use by the currently open txg.
*/
void
dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
int64_t aspace = spa_get_asize(os->os_spa, space);

if (ds != NULL) {
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}

dmu_tx_willuse_space(tx, aspace);
}

/*
* Scans a block at the indicated "level" looking for a hole or data,
* depending on 'flags'.
38 changes: 0 additions & 38 deletions module/zfs/dsl_dataset.c
@@ -242,42 +242,6 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
return (used);
}

uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
uint64_t trysnap = 0;

if (ds == NULL)
return (0);
/*
* The snapshot creation could fail, but that would cause an
* incorrect FALSE return, which would only result in an
* overestimation of the amount of space that an operation would
* consume, which is OK.
*
* There's also a small window where we could miss a pending
* snapshot, because we could set the sync task in the quiescing
* phase. So this should only be used as a guess.
*/
if (ds->ds_trysnap_txg >
spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
trysnap = ds->ds_trysnap_txg;
return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
uint64_t blk_birth)
{
if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
(bp != NULL && BP_IS_HOLE(bp)))
return (B_FALSE);

ddt_prefetch(dsl_dataset_get_spa(ds), bp);

return (B_TRUE);
}

/*
We have to release the fsid synchronously or we risk that a subsequent
* mount of the same dataset will fail to unique_insert the fsid. This
@@ -3731,8 +3695,6 @@ EXPORT_SYMBOL(dsl_dataset_space_wouldfree);
EXPORT_SYMBOL(dsl_dataset_sync);
EXPORT_SYMBOL(dsl_dataset_block_born);
EXPORT_SYMBOL(dsl_dataset_block_kill);
EXPORT_SYMBOL(dsl_dataset_block_freeable);
EXPORT_SYMBOL(dsl_dataset_prev_snap_txg);
EXPORT_SYMBOL(dsl_dataset_dirty);
EXPORT_SYMBOL(dsl_dataset_stats);
EXPORT_SYMBOL(dsl_dataset_fast_stat);
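
For intuition about the predicate deleted here: a block was considered freeable only if it was born after the most recent (possibly still pending) snapshot; otherwise the snapshot still references it. A worked instance of the removed check, with made-up txg numbers:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t prev_snap_txg = 150;	/* dataset last snapshotted in txg 150 */
	uint64_t blk_birth = 100;	/* block written in txg 100 */

	/* Born at or before the snapshot => still referenced => not freeable. */
	printf("freeable: %d\n", blk_birth > prev_snap_txg);	/* prints 0 */
	return (0);
}
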
40 changes: 18 additions & 22 deletions module/zfs/dsl_dir.c
@@ -1031,13 +1031,12 @@ static uint64_t
dsl_dir_space_towrite(dsl_dir_t *dd)
{
uint64_t space = 0;
int i;

ASSERT(MUTEX_HELD(&dd->dd_lock));

for (i = 0; i < TXG_SIZE; i++) {
space += dd->dd_space_towrite[i&TXG_MASK];
ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
for (int i = 0; i < TXG_SIZE; i++) {
space += dd->dd_space_towrite[i & TXG_MASK];
ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
}
return (space);
}
@@ -1117,16 +1116,13 @@ struct tempreserve {

static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
boolean_t ignorequota, list_t *tr_list,
dmu_tx_t *tx, boolean_t first)
{
uint64_t txg = tx->tx_txg;
uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
uint64_t deferred = 0;
uint64_t quota;
struct tempreserve *tr;
int retval = EDQUOT;
int txgidx = txg & TXG_MASK;
int i;
uint64_t ref_rsrv = 0;

ASSERT3U(txg, !=, 0);
@@ -1138,10 +1134,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* Check against the dsl_dir's quota. We don't add in the delta
* when checking for over-quota because they get one free hit.
*/
est_inflight = dsl_dir_space_towrite(dd);
for (i = 0; i < TXG_SIZE; i++)
uint64_t est_inflight = dsl_dir_space_towrite(dd);
for (int i = 0; i < TXG_SIZE; i++)
est_inflight += dd->dd_tempreserved[i];
used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

/*
* On the first iteration, fetch the dataset's used-on-disk and
@@ -1152,9 +1148,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
int error;
dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

error = dsl_dataset_check_quota(ds, checkrefquota,
error = dsl_dataset_check_quota(ds, !netfree,
asize, est_inflight, &used_on_disk, &ref_rsrv);
if (error) {
if (error != 0) {
mutex_exit(&dd->dd_lock);
DMU_TX_STAT_BUMP(dmu_tx_quota);
return (error);
@@ -1180,6 +1176,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* we're very close to full, this will allow a steady trickle of
* removes to get through.
*/
uint64_t deferred = 0;
if (dd->dd_parent == NULL) {
spa_t *spa = dd->dd_pool->dp_spa;
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
Expand Down Expand Up @@ -1210,9 +1207,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
}

/* We need to up our estimated delta before dropping dd_lock */
dd->dd_tempreserved[txgidx] += asize;
dd->dd_tempreserved[txg & TXG_MASK] += asize;

parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
asize - ref_rsrv);
mutex_exit(&dd->dd_lock);

@@ -1222,11 +1219,11 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
list_insert_tail(tr_list, tr);

/* see if it's OK with our parent */
if (dd->dd_parent && parent_rsrv) {
if (dd->dd_parent != NULL && parent_rsrv != 0) {
boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);

return (dsl_dir_tempreserve_impl(dd->dd_parent,
parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE));
} else {
return (0);
}
@@ -1240,7 +1237,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
*/
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
{
int err;
list_t *tr_list;
@@ -1254,7 +1251,6 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
list_create(tr_list, sizeof (struct tempreserve),
offsetof(struct tempreserve, tr_node));
ASSERT3S(asize, >, 0);
ASSERT3S(fsize, >=, 0);

err = arc_tempreserve_space(lsize, tx->tx_txg);
if (err == 0) {
@@ -1281,8 +1277,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
}

if (err == 0) {
err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
FALSE, asize > usize, tr_list, tx, TRUE);
err = dsl_dir_tempreserve_impl(dd, asize, netfree,
B_FALSE, tr_list, tx, B_TRUE);
}

if (err != 0)
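
A worked example of the reservation math in dsl_dir_tempreserve_impl() above: in-flight space is the sum of dd_space_towrite[] and dd_tempreserved[] across txgs, and (simplifying away refquota, deferred frees, and the recursion into the parent dir) the hold fails with EDQUOT when used-on-disk plus in-flight plus the new reservation exceeds the quota:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t quota = 10ULL << 30;		/* 10G dsl_dir quota */
	uint64_t used_on_disk = 9ULL << 30;	/* 9G already charged to the dir */
	uint64_t est_inflight = 512ULL << 20;	/* towrite + tempreserved */
	uint64_t asize = 600ULL << 20;		/* worst-case size of this tx */

	if (used_on_disk + est_inflight + asize > quota)
		printf("EDQUOT\n");	/* 9G + 512M + 600M > 10G */
	return (0);
}
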
3 changes: 1 addition & 2 deletions module/zfs/spa_misc.c
@@ -1615,7 +1615,7 @@ spa_freeze_txg(spa_t *spa)

/* ARGSUSED */
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
{
return (lsize * spa_asize_inflation);
}
@@ -2078,7 +2078,6 @@ EXPORT_SYMBOL(spa_version);
EXPORT_SYMBOL(spa_state);
EXPORT_SYMBOL(spa_load_state);
EXPORT_SYMBOL(spa_freeze_txg);
EXPORT_SYMBOL(spa_get_asize);
EXPORT_SYMBOL(spa_get_dspace);
EXPORT_SYMBOL(spa_update_dspace);
EXPORT_SYMBOL(spa_deflate);
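
The rename from spa_get_asize() to spa_get_worst_case_asize() is purely for clarity: the computation is still lsize * spa_asize_inflation, an upper bound rather than an estimate. A standalone illustration (the default inflation factor of 24, i.e. (RAID-Z max parity + 1) * DVAs per bp * 2, is an assumption here; check spa_misc.c for the current value):

#include <stdint.h>
#include <stdio.h>

static uint64_t spa_asize_inflation = 24;	/* assumed default */

static uint64_t
worst_case_asize(uint64_t lsize)
{
	return (lsize * spa_asize_inflation);
}

int
main(void)
{
	/* A 128K logical write reserves 3M of worst-case allocated space. */
	printf("%llu\n", (unsigned long long)worst_case_asize(128 << 10));
	return (0);
}
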
61 changes: 0 additions & 61 deletions module/zfs/zap.c
@@ -1357,64 +1357,3 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
}
}
}

int
fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
refcount_t *tooverwrite)
{
zap_t *zap = zn->zn_zap;
zap_leaf_t *l;
int err;

/*
* Account for the header block of the fatzap.
*/
if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
(void) refcount_add_many(tooverwrite,
zap->zap_dbuf->db_size, FTAG);
} else {
(void) refcount_add_many(towrite,
zap->zap_dbuf->db_size, FTAG);
}

/*
* Account for the pointer table blocks.
* If we are adding, we need to account for the following cases:
* - If the pointer table is embedded, this operation could force an
* external pointer table.
* - If this already has an external pointer table this operation
* could extend the table.
*/
if (add) {
if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
(void) refcount_add_many(towrite,
zap->zap_dbuf->db_size, FTAG);
} else {
(void) refcount_add_many(towrite,
zap->zap_dbuf->db_size * 3, FTAG);
}
}

/*
* Now check if the block containing the leaf is freeable
* and account accordingly.
*/
err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
if (err != 0) {
return (err);
}

if (!add && dmu_buf_freeable(l->l_dbuf)) {
(void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG);
} else {
/*
* If this is an add operation, the leaf block could split.
* Hence, we need to account for an additional leaf block.
*/
(void) refcount_add_many(towrite,
(add ? 2 : 1) * l->l_dbuf->db_size, FTAG);
}

zap_put_leaf(l);
return (0);
}
83 changes: 0 additions & 83 deletions module/zfs/zap_micro.c
@@ -1594,88 +1594,6 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
return (0);
}

int
zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
refcount_t *towrite, refcount_t *tooverwrite)
{
zap_t *zap;
int err = 0;

/*
* Since we don't have a name, we cannot figure out which blocks will
* be affected by this operation, so account for the worst case:
* - 3 blocks overwritten: target leaf, ptrtbl block, header block
* - 4 new blocks written if adding:
* - 2 blocks for possibly split leaves,
* - 2 grown ptrtbl blocks
*
* This also accommodates the case where an add operation to a fairly
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
(void) refcount_add_many(towrite,
(3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
return (err);
}

/*
* We lock the zap with adding == FALSE, because if we passed
* the actual value of add it could trigger a mzap_upgrade().
* At present we are just evaluating the possibility of this operation
* and hence we do not want to trigger an upgrade.
*/
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);

if (!zap->zap_ismicro) {
zap_name_t *zn = zap_name_alloc(zap, name, 0);
if (zn) {
err = fzap_count_write(zn, add, towrite,
tooverwrite);
zap_name_free(zn);
} else {
/*
* We treat this case as similar to (name == NULL)
*/
(void) refcount_add_many(towrite,
(3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
}
} else {
/*
* We are here if (name != NULL) and this is a micro-zap.
* We account for the header block depending on whether it
* is freeable.
*
* In case of an add operation it is hard to find out
* if this add will promote this microzap to a fatzap.
* Hence, we consider the worst case and account for the
* blocks assuming this microzap would be promoted to a
* fatzap.
*
* 1 block overwritten : header block
* 4 new blocks written : 2 new split leaf, 2 grown
* ptrtbl blocks
*/
if (dmu_buf_freeable(zap->zap_dbuf)) {
(void) refcount_add_many(tooverwrite,
MZAP_MAX_BLKSZ, FTAG);
} else {
(void) refcount_add_many(towrite,
MZAP_MAX_BLKSZ, FTAG);
}

if (add) {
(void) refcount_add_many(towrite,
4 * MZAP_MAX_BLKSZ, FTAG);
}
}

zap_unlockdir(zap, FTAG);
return (err);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
Expand All @@ -1694,7 +1612,6 @@ EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_count_write_by_dnode);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
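
For scale on the zap removals above: the deleted fallback path charged (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE per operation (with 128K legacy blocks, up to 896K for a single add), which is exactly the kind of per-hold estimation the simplified dmu_tx.c no longer performs:

#include <stdio.h>

#define	SPA_OLD_MAXBLOCKSIZE	(128 * 1024)	/* legacy max block size */

int
main(void)
{
	int add = 1;	/* worst case: an insert */
	long bytes = (3L + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
	printf("%ld bytes\n", bytes);	/* 917504 (~896K) per zap op */
	return (0);
}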