Skip to content

Commit

Permalink
9318 vol_volsize_to_reservation does not account for raidz skip blocks
Browse files Browse the repository at this point in the history
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
  • Loading branch information
Mike Gerdts committed Jun 20, 2019
1 parent 807b45f commit 969a6b1
Show file tree
Hide file tree
Showing 9 changed files with 517 additions and 15 deletions.
7 changes: 4 additions & 3 deletions usr/src/cmd/zfs/zfs_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
Expand Down Expand Up @@ -876,10 +876,11 @@ zfs_do_create(int argc, char **argv)
zpool_close(zpool_handle);
goto error;
}
zpool_close(zpool_handle);

volsize = zvol_volsize_to_reservation(volsize, real_props);
volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
real_props);
nvlist_free(real_props);
zpool_close(zpool_handle);

if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
&strval) != 0) {
Expand Down
3 changes: 2 additions & 1 deletion usr/src/lib/libzfs/common/libzfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,8 @@ extern int zfs_hold(zfs_handle_t *, const char *, const char *,
extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t,
nvlist_t *);

typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
uid_t rid, uint64_t space);
Expand Down
193 changes: 185 additions & 8 deletions usr/src/lib/libzfs/common/libzfs_dataset.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc. All rights reserved.
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
Expand Down Expand Up @@ -1514,6 +1514,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
uint64_t new_reservation;
zfs_prop_t resv_prop;
nvlist_t *props;
zpool_handle_t *zph = zpool_handle(zhp);

/*
* If this is an existing volume, and someone is setting the volsize,
Expand All @@ -1528,7 +1529,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));

if ((zvol_volsize_to_reservation(old_volsize, props) !=
if ((zvol_volsize_to_reservation(zph, old_volsize, props) !=
old_reservation) || nvlist_exists(nvl,
zfs_prop_to_name(resv_prop))) {
fnvlist_free(props);
Expand All @@ -1539,7 +1540,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
fnvlist_free(props);
return (-1);
}
new_reservation = zvol_volsize_to_reservation(new_volsize, props);
new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props);
fnvlist_free(props);

if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
Expand Down Expand Up @@ -1594,7 +1595,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
}

resvsize = zvol_volsize_to_reservation(volsize, props);
resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize,
props);
fnvlist_free(props);

(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
Expand Down Expand Up @@ -5111,12 +5113,180 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
}

/*
* Convert the zvol's volume size to an appropriate reservation.
* The theory of raidz space accounting
*
* The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block
* will "reference" 128KB, even though it allocates more than that, to store the
* parity information (and perhaps skip sectors). This concept of the
* "referenced" (and other DMU space accounting) being lower than the allocated
* space by a constant factor is called "raidz deflation."
*
* As mentioned above, the constant factor for raidz deflation assumes a 128KB
* block size. However, zvols typically have a much smaller block size (default
* 8KB). These smaller blocks may require proportionally much more parity
* information (and perhaps skip sectors). In this case, the change to the
* "referenced" property may be much more than the logical block size.
*
* Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written
* as follows.
*
* +-------+-------+-------+-------+-------+
* | disk1 | disk2 | disk3 | disk4 | disk5 |
* +-------+-------+-------+-------+-------+
* | P0 | D0 | D8 | D16 | D24 |
* | P1 | D1 | D9 | D17 | D25 |
* | P2 | D2 | D10 | D18 | D26 |
* | P3 | D3 | D11 | D19 | D27 |
* | P4 | D4 | D12 | D20 | D28 |
* | P5 | D5 | D13 | D21 | D29 |
* | P6 | D6 | D14 | D22 | D30 |
* | P7 | D7 | D15 | D23 | D31 |
* +-------+-------+-------+-------+-------+
*
* Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data
* sectors. The dataset's referenced will increase by 128k and the pool's
* allocated and free properties will be adjusted by 160k.
*
* A 4k block written to the same raidz vdev will require two 4k sectors. The
* blank cells represent unallocated space.
*
* +-------+-------+-------+-------+-------+
* | disk1 | disk2 | disk3 | disk4 | disk5 |
* +-------+-------+-------+-------+-------+
* | P0 | D0 | | | |
* +-------+-------+-------+-------+-------+
*
* Above, notice that the 4k block required one sector for parity and another
* for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
* and free properties will be adjusted by 8k. The dataset will not be charged
* 8k. Rather, it will be charged a value that is scaled according to the
* overhead of the 128k block on the same vdev. This 8k allocation will be
* charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
* calculated in the 128k block example above.
*
* Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
* is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
* allocations are a multiple of 3 sectors, and raidz3 allocations are a
* multiple of of 4 sectors. When a block does not fill the required number of
* sectors, skip blocks (sectors) are used.
*
* An 8k block being written to a raidz vdev may be written as follows:
*
* +-------+-------+-------+-------+-------+
* | disk1 | disk2 | disk3 | disk4 | disk5 |
* +-------+-------+-------+-------+-------+
* | P0 | D0 | D1 | S0 | |
* +-------+-------+-------+-------+-------+
*
* In order to maintain the nparity+1 allocation size, a skip block (S0) was
* added. For this 8k block, the pool's allocated and free properties are
* adjusted by 16k and the dataset's referenced is increased by 16k * 128k /
* 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
* the 128k block example above.
*
* Compression may lead to a variety of block sizes being written for the same
* volume or file. There is no clear way to reserve just the amount of space
* that will be required, so the worst case (no compression) is assumed.
* Note that metadata blocks will typically be compressed, so the reservation
* size returned by zvol_volsize_to_reservation() will generally be slightly
* larger than the maximum that the volume can reference.
*/

/*
* Derived from function of same name in uts/common/fs/zfs/vdev_raidz.c.
* Returns the amount of space (in bytes) that will be allocated for the
* specified block size. Note that the "referenced" space accounted will be less
* than this, but not necessarily equal to "blksize", due to RAIDZ deflation.
*/
static uint64_t
vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
uint64_t blksize)
{
uint64_t asize, ndata;

ASSERT3U(ndisks, >, nparity);
ndata = ndisks - nparity;
asize = ((blksize - 1) >> ashift) + 1;
asize += nparity * ((asize + ndata - 1) / ndata);
asize = roundup(asize, nparity + 1) << ashift;

return (asize);
}

/*
* Determine how much space will be allocated if it lands on the most space-
* inefficient top-level vdev. Returns the size in bytes required to store one
* copy of the volume data. See theory comment above.
*/
static uint64_t
volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
{
nvlist_t *config, *tree, **vdevs;
uint_t nvdevs, v;
uint64_t ret = 0;

config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
&vdevs, &nvdevs) != 0) {
return (nblocks * blksize);
}

for (v = 0; v < nvdevs; v++) {
char *type;
uint64_t nparity, ashift, asize, tsize;
nvlist_t **disks;
uint_t ndisks;
uint64_t volsize;

if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
&type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
&nparity) != 0 ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
&ashift) != 0 ||
nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
&disks, &ndisks) != 0) {
continue;
}

/* allocation size for the "typical" 128k block */
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
SPA_OLD_MAXBLOCKSIZE);
/* allocation size for the blksize block */
asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);

/*
* Scale this size down as a ratio of 128k / tsize. See theory
* statement above.
*
* Note that SPA_OLD_MAXBLOCKSIZE will always be less than
* tsize, so the integer division SPA_OLD_MAXBLOCKSIZE / tsize
* will be 0. The order of operations below matters.
*/
volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
if (volsize > ret) {
ret = volsize;
}
}

if (ret == 0) {
ret = nblocks * blksize;
}

return (ret);
}

/*
* Convert the zvol's volume size to an appropriate reservation. See theory
* comment above.
*
* Note: If this routine is updated, it is necessary to update the ZFS test
* suite's shell version in reservation.kshlib.
* suite's shell version in reservation.shlib.
*/
uint64_t
zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize,
nvlist_t *props)
{
uint64_t numdb;
uint64_t nblocks, volblocksize;
Expand All @@ -5132,7 +5302,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
&volblocksize) != 0)
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
nblocks = volsize/volblocksize;

nblocks = volsize / volblocksize;
/*
* Metadata defaults to using 128k blocks, not volblocksize blocks. For
* this reason, only the data blocks are scaled based on vdev config.
*/
volsize = volsize_from_vdevs(zph, nblocks, volblocksize);

/* start with metadnode L0-L6 */
numdb = 7;
/* calculate number of indirects */
Expand Down
3 changes: 3 additions & 0 deletions usr/src/pkg/manifests/system-test-zfstest.mf
Original file line number Diff line number Diff line change
Expand Up @@ -2521,6 +2521,9 @@ file path=opt/zfs-tests/tests/functional/refreserv/refreserv_002_pos mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_003_pos mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_004_pos mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_005_pos mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz \
mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_raidz mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/setup mode=0555
file path=opt/zfs-tests/tests/functional/removal/cleanup mode=0555
file path=opt/zfs-tests/tests/functional/removal/removal.kshlib mode=0444
Expand Down
3 changes: 2 additions & 1 deletion usr/src/test/zfs-tests/runfiles/delphix.run
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',

[/opt/zfs-tests/tests/functional/refreserv]
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
'refreserv_004_pos', 'refreserv_005_pos']
'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz',
'refreserv_multi_raidz']

[/opt/zfs-tests/tests/functional/removal]
pre =
Expand Down
3 changes: 2 additions & 1 deletion usr/src/test/zfs-tests/runfiles/omnios.run
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',

[/opt/zfs-tests/tests/functional/refreserv]
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
'refreserv_004_pos', 'refreserv_005_pos']
'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz',
'refreserv_multi_raidz']

[/opt/zfs-tests/tests/functional/removal]
pre =
Expand Down
3 changes: 2 additions & 1 deletion usr/src/test/zfs-tests/runfiles/openindiana.run
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',

[/opt/zfs-tests/tests/functional/refreserv]
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
'refreserv_004_pos', 'refreserv_005_pos']
'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz',
'refreserv_multi_raidz']

[/opt/zfs-tests/tests/functional/removal]
pre =
Expand Down
Loading

0 comments on commit 969a6b1

Please sign in to comment.