diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
index aede315502d9..b853081e8b7c 100644
--- a/usr/src/uts/common/fs/zfs/dmu_object.c
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -167,6 +167,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 		return (err);
 
 	ASSERT(dn->dn_type != DMU_OT_NONE);
+	/*
+	 * If we don't create this free range, we'll leak indirect blocks when
+	 * we get to freeing the dnode in syncing context.
+	 */
 	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
 	dnode_free(dn, tx);
 	dnode_rele(dn, FTAG);
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index aa00acec8e24..2d7d78019209 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -733,6 +733,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 	ndn->dn_datablkszsec = odn->dn_datablkszsec;
 	ndn->dn_datablksz = odn->dn_datablksz;
 	ndn->dn_maxblkid = odn->dn_maxblkid;
+	bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
+	    sizeof (odn->dn_next_type));
 	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
 	    sizeof (odn->dn_next_nblkptr));
 	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
@@ -1507,6 +1509,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 	}
 }
 
+/*
+ * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
+ * and end_blkid.
+ */
+static void
+dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t db_search;
+	dmu_buf_impl_t *db;
+	avl_index_t where;
+
+	mutex_enter(&dn->dn_dbufs_mtx);
+
+	db_search.db_level = 1;
+	db_search.db_blkid = start_blkid + 1;
+	db_search.db_state = DB_SEARCH;
+	for (;;) {
+		db = avl_find(&dn->dn_dbufs, &db_search, &where);
+		if (db == NULL)
+			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+		if (db == NULL || db->db_level != 1 ||
+		    db->db_blkid >= end_blkid) {
+			break;
+		}
+
+		/*
+		 * Set up the next blkid we want to search for.
+		 */
+		db_search.db_blkid = db->db_blkid + 1;
+		ASSERT3U(db->db_blkid, >=, start_blkid);
+
+		/*
+		 * If the dbuf transitions to DB_EVICTING while we're trying
+		 * to dirty it, then we will be unable to discover it in
+		 * the dbuf hash table. This will result in a call to
+		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
+		 * lock. To avoid a deadlock, we drop the lock before
+		 * dirtying the level-1 dbuf.
+		 */
+		mutex_exit(&dn->dn_dbufs_mtx);
+		dnode_dirty_l1(dn, db->db_blkid, tx);
+		mutex_enter(&dn->dn_dbufs_mtx);
+	}
+
+#ifdef ZFS_DEBUG
+	/*
+	 * Walk all the in-core level-1 dbufs and verify they have been
+	 * dirtied.
+	 */
+	db_search.db_level = 1;
+	db_search.db_blkid = start_blkid + 1;
+	db_search.db_state = DB_SEARCH;
+	db = avl_find(&dn->dn_dbufs, &db_search, &where);
+	if (db == NULL)
+		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+		if (db->db_level != 1 || db->db_blkid >= end_blkid)
+			break;
+		ASSERT(db->db_dirtycnt > 0);
+	}
+#endif
+	mutex_exit(&dn->dn_dbufs_mtx);
+}
+
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
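A note on the dnode_dirty_l1range() loop added above: dn_dbufs_mtx cannot be held across the dnode_dirty_l1() call, because an L1 dbuf that has meanwhile transitioned to DB_EVICTING will be missed in the dbuf hash table and re-created via dbuf_create(), which itself acquires dn_dbufs_mtx. The loop therefore saves the next blkid to visit, drops the mutex, dirties the dbuf, and re-finds its position in the dn_dbufs AVL tree on the next pass. The self-contained sketch below shows the same restart-by-key pattern; list_lock, find_at_least(), and process() are hypothetical stand-ins (a sorted array plays the role of the AVL tree), not ZFS interfaces.

#include <pthread.h>
#include <stdio.h>

/* Hypothetical stand-in for dn_dbufs: a sorted set of keys. */
static int keys[] = { 2, 5, 7, 11 };
static const int nkeys = 4;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Return the smallest key >= min_key, or -1 if none (cf. avl_nearest). */
static int
find_at_least(int min_key)
{
	for (int i = 0; i < nkeys; i++) {
		if (keys[i] >= min_key)
			return (keys[i]);
	}
	return (-1);
}

/* Stand-in for dnode_dirty_l1(): must run without list_lock held. */
static void
process(int key)
{
	printf("dirtying %d\n", key);
}

int
main(void)
{
	int next = 0;

	pthread_mutex_lock(&list_lock);
	for (;;) {
		int key = find_at_least(next);
		if (key == -1)
			break;
		/*
		 * Record where to resume before dropping the lock; the
		 * element may be evicted while we are unlocked, but the
		 * saved key lets us re-find our place afterwards.
		 */
		next = key + 1;
		pthread_mutex_unlock(&list_lock);
		process(key);
		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
	return (0);
}

The point of the re-lookup is that nothing learned while unlocked is trusted: after reacquiring list_lock, iteration resumes from the saved key, never from a stale element pointer.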
@@ -1539,13 +1607,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	if (off == 0 && len >= blksz) {
 		/*
 		 * Freeing the whole block; fast-track this request.
-		 * Note that we won't dirty any indirect blocks,
-		 * which is fine because we will be freeing the entire
-		 * file and thus all indirect blocks will be freed
-		 * by free_children().
 		 */
 		blkid = 0;
 		nblks = 1;
+		if (dn->dn_nlevels > 1)
+			dnode_dirty_l1(dn, 0, tx);
 		goto done;
 	} else if (off >= blksz) {
 		/* Freeing past end-of-data */
@@ -1658,6 +1724,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 		if (last != first)
 			dnode_dirty_l1(dn, last, tx);
 
+		dnode_dirty_l1range(dn, first, last, tx);
+
 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
 		for (uint64_t i = first + 1; i < last; i++) {
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 2fcaf7927de6..2ee75c90c2fc 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -229,9 +229,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 }
 #endif
 
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready(), which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion.
+ * In addition, we know the case described above cannot occur, because the
+ * dnode is being freed. Therefore, we free the indirect blocks immediately
+ * in that case.
+ */
 static void
 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
-    dmu_tx_t *tx)
+    boolean_t free_indirects, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	blkptr_t *bp;
@@ -248,6 +263,24 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	if (db->db_state != DB_CACHED)
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 
+	/*
+	 * If we modify this indirect block, and we are not freeing the
+	 * dnode (!free_indirects), then this indirect block needs to get
+	 * written to disk by dbuf_write(). If it is dirty, we know it will
+	 * be written (otherwise, we would have incorrect on-disk state
+	 * because the space would be freed but still referenced by the BP
+	 * in this indirect block). Therefore we VERIFY that it is
+	 * dirty.
+	 *
+	 * Our VERIFY covers some cases that do not actually have to be
+	 * dirty, but the open-context code happens to dirty. E.g. if the
+	 * blocks we are freeing are all holes, because in that case, we
+	 * are only freeing part of this indirect block, so it is an
+	 * ancestor of the first or last block to be freed. The first and
+	 * last L1 indirect blocks are always dirtied by dnode_free_range().
+	 */
+	VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+
 	dbuf_release_bp(db);
 	bp = db->db.db_data;
 
@@ -283,32 +316,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 			rw_exit(&dn->dn_struct_rwlock);
 
 			ASSERT3P(bp, ==, subdb->db_blkptr);
-			free_children(subdb, blkid, nblks, tx);
+			free_children(subdb, blkid, nblks, free_indirects, tx);
 			dbuf_rele(subdb, FTAG);
 		}
 	}
 
-	/* If this whole block is free, free ourself too. */
-	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
-		if (!BP_IS_HOLE(bp))
-			break;
-	}
-	if (i == 1 << epbs) {
-		/*
-		 * We only found holes. Grab the rwlock to prevent
-		 * anybody from reading the blocks we're about to
-		 * zero out.
-		 */
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	if (free_indirects) {
+		for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+			ASSERT(BP_IS_HOLE(bp));
 		bzero(db->db.db_data, db->db.db_size);
-		rw_exit(&dn->dn_struct_rwlock);
 		free_blocks(dn, db->db_blkptr, 1, tx);
-	} else {
-		/*
-		 * Partial block free; must be marked dirty so that it
-		 * will be written out.
-		 */
-		ASSERT(db->db_dirtycnt > 0);
 	}
 
 	DB_DNODE_EXIT(db);
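Background for the free_children() changes above: a hole block pointer records the txg in which the hole was created (the hole_birth feature), and incremental send streams use that birth time to decide whether a hole needs to be transmitted. Freeing an indirect block from free_children() while a same-txg write to it is pending would discard those birth times, which is why the freeing is deferred to dbuf_write_children_ready() except when the whole dnode is going away. The new VERIFY encodes the invariant "either this indirect block references no allocated space, or it is dirty and will be rewritten". Below is a small model of that invariant, using hypothetical model_* types rather than the real blkptr_t and dmu_buf_impl_t.

#include <assert.h>
#include <stdint.h>

typedef struct {
	uint64_t fill;	/* blocks allocated beneath this BP (cf. BP_GET_FILL) */
} model_bp_t;

typedef struct {
	model_bp_t *blkptr;	/* the BP that references this indirect block */
	int dirtycnt;		/* outstanding dirty records (cf. db_dirtycnt) */
} model_dbuf_t;

/*
 * If the indirect block still references allocated space and we are
 * punching holes in it without freeing it, it must be dirty; otherwise
 * the on-disk tree would keep referencing space we just freed.
 */
static void
verify_will_be_written(const model_dbuf_t *db)
{
	assert(db->blkptr->fill == 0 || db->dirtycnt > 0);
}

int
main(void)
{
	model_bp_t bp = { .fill = 32 };
	model_dbuf_t db = { .blkptr = &bp, .dirtycnt = 1 };

	verify_will_be_written(&db);	/* dirty and still referenced: OK */

	bp.fill = 0;
	db.dirtycnt = 0;
	verify_will_be_written(&db);	/* all holes: OK even if clean */
	return (0);
}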
@@ -321,7 +338,7 @@
  */
 static void
 dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
-    dmu_tx_t *tx)
+    boolean_t free_indirects, dmu_tx_t *tx)
 {
 	blkptr_t *bp = dn->dn_phys->dn_blkptr;
 	int dnlevel = dn->dn_phys->dn_nlevels;
@@ -361,7 +378,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
 			    TRUE, FALSE, FTAG, &db));
 			rw_exit(&dn->dn_struct_rwlock);
 
-			free_children(db, blkid, nblks, tx);
+			free_children(db, blkid, nblks, free_indirects, tx);
 			dbuf_rele(db, FTAG);
 		}
 	}
@@ -380,6 +397,7 @@
 typedef struct dnode_sync_free_range_arg {
 	dnode_t *dsfra_dnode;
 	dmu_tx_t *dsfra_tx;
+	boolean_t dsfra_free_indirects;
 } dnode_sync_free_range_arg_t;
 
 static void
@@ -389,7 +407,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
 	dnode_t *dn = dsfra->dsfra_dnode;
 
 	mutex_exit(&dn->dn_mtx);
-	dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+	dnode_sync_free_range_impl(dn, blkid, nblks,
+	    dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
 	mutex_enter(&dn->dn_mtx);
 }
 
@@ -670,6 +689,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		dnode_sync_free_range_arg_t dsfra;
 		dsfra.dsfra_dnode = dn;
 		dsfra.dsfra_tx = tx;
+		dsfra.dsfra_free_indirects = freeing_dnode;
+		if (freeing_dnode) {
+			ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
+			    0, dn->dn_maxblkid + 1));
+		}
 		mutex_enter(&dn->dn_mtx);
 		range_tree_vacate(dn->dn_free_ranges[txgoff],
 		    dnode_sync_free_range, &dsfra);
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 73e17a4a3c18..93545ee4a10a 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -1603,7 +1603,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
 		return (0);
 	}
 
-	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
+	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+	    DMU_OBJECT_END);
 	if (error) {
 		zfs_range_unlock(rl);
 		return (error);
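Finally, the zfs_trunc() change is cosmetic rather than functional: dmu_free_long_range() takes the length as a uint64_t, and DMU_OBJECT_END is defined in sys/dmu.h as (-1ULL), so the old literal -1 converted to exactly the same sentinel. Spelling it DMU_OBJECT_END makes the "free to the end of the object" intent explicit and matches the dmu_object_free() path touched at the top of this patch. A quick standalone check of the equivalence:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the definition in usr/src/uts/common/fs/zfs/sys/dmu.h. */
#define	DMU_OBJECT_END	(-1ULL)

int
main(void)
{
	/* The old call site's -1, converted to the parameter type. */
	uint64_t old_len = (uint64_t)-1;

	assert(old_len == DMU_OBJECT_END);
	assert(DMU_OBJECT_END == UINT64_MAX);
	printf("DMU_OBJECT_END = 0x%016llx\n",
	    (unsigned long long)DMU_OBJECT_END);
	return (0);
}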