Skip to content

Commit

Permalink
Fix handling of maxblkid for raw sends
Browse files Browse the repository at this point in the history
Currently, the receive code can create an unreadable dataset from
a correct raw send stream. This is because it is currently
impossible to set maxblkid to a lower value without freeing the
associated object. This means truncating files on the send side
to a non-0 size could result in corruption. This patch solves this
issue by adding a new 'force' flag to dnode_new_blkid() which will
allow the raw receive code to force the DMU to accept the provided
maxblkid even if it is a lower value than the existing one.

For testing purposes the send_encrypted_files.ksh test has been
extended to include a variety of truncated files and multiple
snapshots. It also now leverages the xattrtest command to help
ensure raw receives correctly handle xattrs.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #8168 
Closes #8487
  • Loading branch information
Tom Caputi authored and behlendorf committed Mar 13, 2019
1 parent 146bdc4 commit 369aa50
Show file tree
Hide file tree
Showing 10 changed files with 156 additions and 55 deletions.
3 changes: 2 additions & 1 deletion include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,8 @@ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,

/*
* Manually set the maxblkid on a dnode. This will adjust nlevels accordingly
* to accommodate the change.
* to accommodate the change. When calling this function, the caller must
* ensure that the object's nlevels can sufficiently support the new maxblkid.
*/
int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
dmu_tx_t *tx);
Expand Down
9 changes: 8 additions & 1 deletion include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,12 @@ struct dnode {
struct zfetch dn_zfetch;
};

/*
* We use this (otherwise unused) bit to indicate if the value of
* dn_next_maxblkid[txgoff] is valid to use in dnode_sync().
*/
#define DMU_NEXT_MAXBLKID_SET (1ULL << 63)

/*
* Adds a level of indirection between the dbuf and the dnode to avoid
* iterating descendent dbufs in dnode_move(). Handles are not allocated
Expand Down Expand Up @@ -423,7 +429,8 @@ int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx);
int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
void dnode_diduse_space(dnode_t *dn, int64_t space);
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx,
boolean_t have_read, boolean_t force);
uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
void dnode_init(void);
void dnode_fini(void);
Expand Down
3 changes: 2 additions & 1 deletion module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -2116,7 +2116,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_level == 0) {
ASSERT(!db->db_objset->os_raw_receive ||
dn->dn_maxblkid >= db->db_blkid);
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
dnode_new_blkid(dn, db->db_blkid, tx,
drop_struct_lock, B_FALSE);
ASSERT(dn->dn_maxblkid >= db->db_blkid);
}

Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -2155,7 +2155,7 @@ dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
if (err)
return (err);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_new_blkid(dn, maxblkid, tx, B_FALSE);
dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
return (0);
Expand Down
62 changes: 51 additions & 11 deletions module/zfs/dmu_recv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1179,10 +1179,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,

object = drro->drr_object;

/* nblkptr will be bounded by the bonus size and type */
/* nblkptr should be bounded by the bonus size and type */
if (rwa->raw && nblkptr != drro->drr_nblkptr)
return (SET_ERROR(EINVAL));

/*
* Check for indicators that the object was freed and
* reallocated. For all sends, these indicators are:
* - A changed block size
* - A smaller nblkptr
* - A changed dnode size
* For raw sends we also check a few other fields to
* ensure we are preserving the objset structure exactly
* as it was on the receive side:
* - A changed indirect block size
* - A smaller nlevels
*/
if (drro->drr_blksz != doi.doi_data_block_size ||
nblkptr < doi.doi_nblkptr ||
dn_slots != doi.doi_dnodesize >> DNODE_SHIFT ||
Expand All @@ -1197,13 +1209,14 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,

/*
* The dmu does not currently support decreasing nlevels
* on an object. For non-raw sends, this does not matter
* and the new object can just use the previous one's nlevels.
* For raw sends, however, the structure of the received dnode
* (including nlevels) must match that of the send side.
* Therefore, instead of using dmu_object_reclaim(), we must
* free the object completely and call dmu_object_claim_dnsize()
* instead.
* or changing the number of dnode slots on an object. For
* non-raw sends, this does not matter and the new object
* can just use the previous one's nlevels. For raw sends,
* however, the structure of the received dnode (including
* nlevels and dnode slots) must match that of the send
* side. Therefore, instead of using dmu_object_reclaim(),
* we must free the object completely and call
* dmu_object_claim_dnsize() instead.
*/
if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
Expand All @@ -1214,6 +1227,23 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
txg_wait_synced(dmu_objset_pool(rwa->os), 0);
object = DMU_NEW_OBJECT;
}

/*
* For raw receives, free everything beyond the new incoming
* maxblkid. Normally this would be done with a DRR_FREE
* record that would come after this DRR_OBJECT record is
* processed. However, for raw receives we manually set the
* maxblkid from the drr_maxblkid and so we must first free
* everything above that blkid to ensure the DMU is always
* consistent with itself.
*/
if (rwa->raw) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
(drro->drr_maxblkid + 1) * drro->drr_blksz,
DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
} else if (err == EEXIST) {
/*
* The object requested is currently an interior slot of a
Expand Down Expand Up @@ -1333,14 +1363,24 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
/* handle more restrictive dnode structuring for raw recvs */
if (rwa->raw) {
/*
* Set the indirect block shift and nlevels. This will not fail
* because we ensured all of the blocks were free earlier if
* this is a new object.
* Set the indirect block size, block shift, nlevels.
* This will not fail because we ensured all of the
* blocks were freed earlier if this is a new object.
* For non-new objects block size and indirect block
* shift cannot change and nlevels can only increase.
*/
VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
drro->drr_blksz, drro->drr_indblkshift, tx));
VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
drro->drr_nlevels, tx));

/*
* Set the maxblkid. We will never free the first block of
* an object here because a maxblkid of 0 could indicate
* an object with a single block or one with no blocks.
* This will always succeed because we freed all blocks
* beyond the new maxblkid above.
*/
VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
drro->drr_maxblkid, tx));
}
Expand Down
26 changes: 21 additions & 5 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1828,7 +1828,8 @@ dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)

/* read-holding callers must not rely on the lock being continuously held */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
boolean_t force)
{
int epbs, new_nlevels;
uint64_t sz;
Expand All @@ -1853,14 +1854,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
}
}

if (blkid <= dn->dn_maxblkid)
/*
* Raw sends (indicated by the force flag) require that we take the
* given blkid even if the value is lower than the current value.
*/
if (!force && blkid <= dn->dn_maxblkid)
goto out;

/*
* We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
* to indicate that this field is set. This allows us to set the
* maxblkid to 0 on an existing object in dnode_sync().
*/
dn->dn_maxblkid = blkid;
dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = blkid;
dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
blkid | DMU_NEXT_MAXBLKID_SET;

/*
* Compute the number of levels necessary to support the new maxblkid.
* Raw sends will ensure nlevels is set correctly for us.
*/
new_nlevels = 1;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
Expand All @@ -1870,8 +1882,12 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)

ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);

if (new_nlevels > dn->dn_nlevels)
dnode_set_nlevels_impl(dn, new_nlevels, tx);
if (!force) {
if (new_nlevels > dn->dn_nlevels)
dnode_set_nlevels_impl(dn, new_nlevels, tx);
} else {
ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
}

out:
if (have_read)
Expand Down
13 changes: 5 additions & 8 deletions module/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -384,12 +384,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
}
}

/*
* Do not truncate the maxblkid if we are performing a raw
* receive. The raw receive sets the mablkid manually and
* must not be overriden.
*/
if (trunc && !dn->dn_objset->os_raw_receive) {
if (trunc) {
ASSERTV(uint64_t off);
dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;

Expand Down Expand Up @@ -765,11 +760,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)

/*
* This must be done after dnode_sync_free_range()
* and dnode_increase_indirection().
* and dnode_increase_indirection(). See dnode_new_blkid()
* for an explanation of the high bit being set.
*/
if (dn->dn_next_maxblkid[txgoff]) {
mutex_enter(&dn->dn_mtx);
dnp->dn_maxblkid = dn->dn_next_maxblkid[txgoff];
dnp->dn_maxblkid =
dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
dn->dn_next_maxblkid[txgoff] = 0;
mutex_exit(&dn->dn_mtx);
}
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dsl_crypt.c
Original file line number Diff line number Diff line change
Expand Up @@ -2099,7 +2099,7 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
mdn->dn_checksum = checksum;

rw_enter(&mdn->dn_struct_rwlock, RW_WRITER);
dnode_new_blkid(mdn, maxblkid, tx, B_FALSE);
dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE);
rw_exit(&mdn->dn_struct_rwlock);

/*
Expand Down
10 changes: 0 additions & 10 deletions tests/zfs-tests/cmd/xattrtest/xattrtest.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,6 @@ parse_args(int argc, char **argv)
break;
case 'y':
verify = 1;
if (phase != PHASE_ALL) {
fprintf(stderr,
"Error: -y and -o are incompatible.\n");
rc = 1;
}
break;
case 'n':
nth = strtol(optarg, NULL, 0);
Expand Down Expand Up @@ -201,11 +196,6 @@ parse_args(int argc, char **argv)
PHASE_ALL, PHASE_INVAL);
rc = 1;
}
if (verify == 1) {
fprintf(stderr,
"Error: -y and -o are incompatible.\n");
rc = 1;
}
break;
default:
rc = 1;
Expand Down
81 changes: 65 additions & 16 deletions tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,18 @@
# 3. Add a small 512 byte file to the filesystem
# 4. Add a larger 32M file to the filesystem
# 5. Add a large sparse file to the filesystem
# 6. Add a file truncated to 4M to the filesystem
# 7. Add a sparse file with metadata compression disabled to the filesystem
# 8. Add and remove 1000 empty files to the filesystem
# 9. Add a file with a large xattr value
# 10. Snapshot the filesystem
# 11. Send and receive the filesystem, ensuring that it can be mounted
# 6. Add a 3 files that are to be truncated later
# 7. Add 1000 empty files to the filesystem
# 8. Add a file with a large xattr value
# 9. Use xattrtest to create files with random xattrs (with and without xattrs=on)
# 10. Take a snapshot of the filesystem
# 11. Truncate one of the files from 32M to 128k
# 12. Truncate one of the files from 512k to 384k
# 13. Truncate one of the files from 512k to 0 to 384k
# 14. Remove the 1000 empty files to the filesystem
# 15. Take another snapshot of the filesystem
# 16. Send and receive both snapshots
# 17. Mount the filesystem and check the contents
#

verify_runnable "both"
Expand All @@ -51,46 +57,89 @@ function cleanup
}
log_onexit cleanup

function recursive_cksum
{
find $1 -type f -exec sha256sum {} \; | \
sort -k 2 | awk '{ print $1 }' | sha256sum
}

log_assert "Verify 'zfs send -w' works with many different file layouts"

typeset keyfile=/$TESTPOOL/pkey
typeset sendfile=/$TESTPOOL/sendfile
typeset sendfile2=/$TESTPOOL/sendfile2

# Create an encrypted dataset
log_must eval "echo 'password' > $keyfile"
log_must zfs create -o encryption=on -o keyformat=passphrase \
-o keylocation=file://$keyfile $TESTPOOL/$TESTFS2

# Create files with vaired layouts on disk
log_must touch /$TESTPOOL/$TESTFS2/empty
log_must mkfile 512 /$TESTPOOL/$TESTFS2/small
log_must mkfile 32M /$TESTPOOL/$TESTFS2/full
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/sparse \
bs=512 count=1 seek=10G >/dev/null 2>&1
bs=512 count=1 seek=1048576 >/dev/null 2>&1
log_must mkfile 32M /$TESTPOOL/$TESTFS2/truncated
log_must truncate -s 4M /$TESTPOOL/$TESTFS2/truncated
sync
log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated2
log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated3

log_must mkdir -p /$TESTPOOL/$TESTFS2/dir
for i in {1..1000}; do
log_must mkfile 512 /$TESTPOOL/$TESTFS2/dir/file-$i
done
sync

log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrondir
log_must zfs set xattr=on $TESTPOOL/$TESTFS2
log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrondir
log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrsadir
log_must zfs set xattr=sa $TESTPOOL/$TESTFS2
log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrsadir

# ZoL issue #7432
log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2
log_must touch /$TESTPOOL/$TESTFS2/attrs
log_must eval "python -c 'print \"a\" * 4096' | \
attr -s bigval /$TESTPOOL/$TESTFS2/attrs"

log_must zfs snapshot $TESTPOOL/$TESTFS2@snap1

#
# Truncate files created in the first snapshot. The first tests
# truncating a large file to a single block. The second tests
# truncating one block off the end of a file without changing
# the required nlevels to hold it. The last tests handling
# of a maxblkid that is dropped and then raised again.
#
log_must truncate -s 131072 /$TESTPOOL/$TESTFS2/truncated
log_must truncate -s 393216 /$TESTPOOL/$TESTFS2/truncated2
log_must truncate -s 0 /$TESTPOOL/$TESTFS2/truncated3
log_must zpool sync $TESTPOOL
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/truncated3 \
bs=128k count=3 iflag=fullblock

# Remove the empty files created in the first snapshot
for i in {1..1000}; do
log_must rm /$TESTPOOL/$TESTFS2/dir/file-$i
done
sync

# ZoL issue #7432
log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2
log_must touch /$TESTPOOL/$TESTFS2/attrs
log_must eval "python -c 'print \"a\" * 4096' | attr -s bigval /$TESTPOOL/$TESTFS2/attrs"
log_must zfs snapshot $TESTPOOL/$TESTFS2@snap2
expected_cksum=$(recursive_cksum /$TESTPOOL/$TESTFS2)

log_must zfs snapshot $TESTPOOL/$TESTFS2@now
log_must eval "zfs send -wR $TESTPOOL/$TESTFS2@now > $sendfile"
log_must eval "zfs send -wp $TESTPOOL/$TESTFS2@snap1 > $sendfile"
log_must eval "zfs send -wp -i @snap1 $TESTPOOL/$TESTFS2@snap2 > $sendfile2"

log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile"
log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile2"
log_must zfs load-key $TESTPOOL/recv

log_must zfs mount -a
actual_cksum=$(recursive_cksum /$TESTPOOL/recv)
[[ "$expected_cksum" != "$actual_cksum" ]] && \
log_fail "Recursive checksums differ ($expected_cksum != $actual_cksum)"

log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrondir
log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrsadir

log_pass "Verified 'zfs send -w' works with many different file layouts"

0 comments on commit 369aa50

Please sign in to comment.