Introduce ARC Buffer Data (ABD)

ZFS currently uses ARC buffers that are backed by virtual memory.
While functional, this approach has some major problems that can be
observed on all OpenZFS platforms.  ABD was designed to address
these issues and includes contributions from OpenZFS developers
across multiple platforms.

While all OpenZFS platforms will benefit from ABD, this functionality
is critical for Linux.  Unlike the other OpenZFS platforms, the Linux
kernel discourages extensive use of virtual memory.  The provided
interfaces are not optimized for frequent allocations from the
virtual address space.  To maintain good performance, a kmem cache
is used which contains relatively long-lived slabs backed by virtual
memory.  The downside to this approach is that those slabs can
become highly fragmented, resulting in inefficient use of memory.

Another issue is that on 32-bit systems the available virtual
address space in the kernel is only a small fraction of total
system memory.  This means the ARC size is highly constrained,
which hurts performance, makes allocating memory difficult, and
makes OOMs more likely.

ABD is designed to address these issues by using scatter lists
of pages for data buffers.  This removes the need for slabs,
which resolves the fragmentation issue.  It also allows high
memory pages to be allocated, which alleviates the virtual
address space pressure on 32-bit systems.
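
As an illustrative aside (not part of this commit message), here is a
minimal sketch of how a scatter ABD might be allocated and filled
through the abd_alloc()/abd_iterate_func()/abd_free() interfaces that
appear in the diff below; zero_fill_cb() and scatter_abd_example()
are invented names used only for this example:

    #include <sys/abd.h>

    /* Hypothetical callback: zero each chunk the iterator hands back. */
    static int
    zero_fill_cb(void *buf, size_t len, void *private)
    {
            (void) private;
            bzero(buf, len);
            return (0);
    }

    static void
    scatter_abd_example(size_t size)
    {
            /* B_FALSE requests a data buffer, backed by a scatter list of pages. */
            abd_t *abd = abd_alloc(size, B_FALSE);

            /* The pages need not be contiguous, so touch them through the iterator. */
            (void) abd_iterate_func(abd, 0, size, zero_fill_cb, NULL);

            abd_free(abd);
    }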

For metadata buffers, which are small, linear ABDs are allocated
from the slab.  This is preferable because there are many places
in the code which expect to be able to read from a given offset
in the buffer.  Using linear ABDs means none of that code needs
to be modified.  The majority of these buffers are allocated with
kmalloc, so there is minimal impact on the virtual address space.
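
By contrast, a minimal hypothetical sketch of the linear case, using
the abd_alloc_linear() and abd_to_buf() calls seen in the zdb.c hunk
below; code that reads at a given offset keeps working against a
flat buffer (linear_abd_example() is an invented name):

    #include <sys/abd.h>

    static void
    linear_abd_example(size_t size)
    {
            /* B_TRUE marks the buffer as metadata; the allocation is linear. */
            abd_t *abd = abd_alloc_linear(size, B_TRUE);

            /* abd_to_buf() is only valid for linear ABDs. */
            char *buf = abd_to_buf(abd);

            buf[0] = 0;     /* direct offset access, unchanged from before */

            abd_free(abd);
    }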

Tested-by: Kash Pande <kash@tripleback.net>
Tested-by: kernelOfTruth <kerneloftruth@gmail.com>
Tested-by: RageLtMan <rageltman@sempervictus>
Tested-by: DHE <git@dehacked.net>
Reviewed-by: Chunwei Chen <david.chen@osnexus.com>
Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: David Quigley <david.quigley@intel.com>
Reviewed-by: Gvozden Neskovic <neskovic@gmail.com>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3441 
Closes #5135
behlendorf committed Nov 30, 2016
2 parents ce43e88 + 9829574 commit 7657def
Showing 59 changed files with 5,023 additions and 2,257 deletions.
8 changes: 4 additions & 4 deletions cmd/raidz_test/raidz_bench.c
@@ -53,18 +53,18 @@ bench_init_raidz_map(void)

/*
* To permit larger column sizes these have to be done
* allocated using aligned alloc instead of zio_data_buf_alloc
* allocated using aligned alloc instead of zio_abd_buf_alloc
*/
zio_bench.io_data = raidz_alloc(max_data_size);
zio_bench.io_abd = raidz_alloc(max_data_size);

init_zio_data(&zio_bench);
init_zio_abd(&zio_bench);
}

static void
bench_fini_raidz_maps(void)
{
/* tear down golden zio */
raidz_free(zio_bench.io_data, max_data_size);
raidz_free(zio_bench.io_abd, max_data_size);
bzero(&zio_bench, sizeof (zio_t));
}

53 changes: 28 additions & 25 deletions cmd/raidz_test/raidz_test.c
@@ -181,10 +181,10 @@ static void process_options(int argc, char **argv)
}
}

#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_data)
#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)

#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_data)
#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)

static int
@@ -195,10 +195,9 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
VERIFY(parity >= 1 && parity <= 3);

for (i = 0; i < parity; i++) {
if (0 != memcmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i),
CODE_COL_SIZE(rm, i))) {
if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
!= 0) {
ret++;

LOG_OPT(D_DEBUG, opts,
"\nParity block [%d] different!\n", i);
}
@@ -213,8 +212,8 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);

for (i = 0; i < dcols; i++) {
if (0 != memcmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i),
DATA_COL_SIZE(opts->rm_golden, i))) {
if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
!= 0) {
ret++;

LOG_OPT(D_DEBUG, opts,
@@ -224,37 +223,41 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
return (ret);
}

static int
init_rand(void *data, size_t size, void *private)
{
int i;
int *dst = (int *) data;

for (i = 0; i < size / sizeof (int); i++)
dst[i] = rand_data[i];

return (0);
}

static void
corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
{
int i;
int *dst;
raidz_col_t *col;

for (i = 0; i < cnt; i++) {
col = &rm->rm_col[tgts[i]];
dst = col->rc_data;
for (i = 0; i < col->rc_size / sizeof (int); i++)
dst[i] = rand();
abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
}
}

void
init_zio_data(zio_t *zio)
init_zio_abd(zio_t *zio)
{
int i;
int *dst = (int *) zio->io_data;

for (i = 0; i < zio->io_size / sizeof (int); i++) {
dst[i] = rand_data[i];
}
abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
}

static void
fini_raidz_map(zio_t **zio, raidz_map_t **rm)
{
vdev_raidz_map_free(*rm);
raidz_free((*zio)->io_data, (*zio)->io_size);
raidz_free((*zio)->io_abd, (*zio)->io_size);
umem_free(*zio, sizeof (zio_t));

*zio = NULL;
@@ -279,11 +282,11 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;

opts->zio_golden->io_data = raidz_alloc(opts->rto_dsize);
zio_test->io_data = raidz_alloc(opts->rto_dsize);
opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
zio_test->io_abd = raidz_alloc(opts->rto_dsize);

init_zio_data(opts->zio_golden);
init_zio_data(zio_test);
init_zio_abd(opts->zio_golden);
init_zio_abd(zio_test);

VERIFY0(vdev_raidz_impl_set("original"));

@@ -326,8 +329,8 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)

(*zio)->io_offset = 0;
(*zio)->io_size = alloc_dsize;
(*zio)->io_data = raidz_alloc(alloc_dsize);
init_zio_data(*zio);
(*zio)->io_abd = raidz_alloc(alloc_dsize);
init_zio_abd(*zio);

rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
total_ncols, parity);
6 changes: 3 additions & 3 deletions cmd/raidz_test/raidz_test.h
@@ -104,11 +104,11 @@ static inline size_t ilog2(size_t a)
#define SEP "----------------\n"


#define raidz_alloc(size) zio_data_buf_alloc(size)
#define raidz_free(p, size) zio_data_buf_free(p, size)
#define raidz_alloc(size) abd_alloc(size, B_FALSE)
#define raidz_free(p, size) abd_free(p)


void init_zio_data(zio_t *zio);
void init_zio_abd(zio_t *zio);

void run_raidz_benchmark(void);

48 changes: 28 additions & 20 deletions cmd/zdb/zdb.c
@@ -59,6 +59,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <zfs_comutil.h>
#include <libzfs.h>

@@ -2464,7 +2465,7 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;

zio_data_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);

mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -2530,7 +2531,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;

/* If it's an intent log block, failure is expected. */
Expand All @@ -2543,7 +2544,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);

zio_nowait(zio_read(NULL, spa, bp, data, size,
zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
}

@@ -3321,6 +3322,13 @@ zdb_vdev_lookup(vdev_t *vdev, char *path)
return (NULL);
}

/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
return (random_get_pseudo_bytes(buf, len));
}

/*
* Read a block from a pool and print it out. The syntax of the
* block descriptor is:
@@ -3352,7 +3360,8 @@ zdb_read_block(char *thing, spa_t *spa)
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
void *pbuf, *lbuf, *buf;
abd_t *pabd;
void *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr;
int i, error;

@@ -3425,8 +3434,7 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;

/* Some 4K native devices require 4K buffer alignment */
pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, PAGESIZE, UMEM_NOFAIL);
pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

BP_ZERO(bp);
@@ -3454,15 +3462,15 @@ zdb_read_block(char *thing, spa_t *spa)
/*
* Treat this as a normal block read.
*/
zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else {
/*
* Treat this as a vdev child I/O.
*/
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@@ -3485,13 +3493,13 @@ zdb_read_block(char *thing, spa_t *spa)
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

bcopy(pbuf, pbuf2, psize);
abd_copy_to_buf(pbuf2, pabd, psize);

VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
SPA_MAXBLOCKSIZE - psize) == 0);
VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
random_get_pseudo_bytes_cb, NULL));

VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize) == 0);
VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize));

/*
* XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB,
@@ -3506,10 +3514,10 @@ zdb_read_block(char *thing, spa_t *spa)
"Trying %05llx -> %05llx (%s)\n",
(u_longlong_t)psize, (u_longlong_t)lsize,
zio_compress_table[c].ci_name);
if (zio_decompress_data(c, pbuf, lbuf,
psize, lsize) == 0 &&
zio_decompress_data(c, pbuf2, lbuf2,
psize, lsize) == 0 &&
if (zio_decompress_data(c, pabd,
lbuf, psize, lsize) == 0 &&
zio_decompress_data_buf(c, pbuf2,
lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0)
break;
}
Expand All @@ -3527,7 +3535,7 @@ zdb_read_block(char *thing, spa_t *spa)
buf = lbuf;
size = lsize;
} else {
buf = pbuf;
buf = abd_to_buf(pabd);
size = psize;
}

@@ -3545,7 +3553,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_block(thing, buf, size, flags);

out:
umem_free(pbuf, SPA_MAXBLOCKSIZE);
abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}