Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

single-chunk scatter ABDs can be treated as linear #8580

Merged
merged 1 commit into from Jun 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 11 additions & 2 deletions include/sys/abd.h
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/

#ifndef _ABD_H
Expand All @@ -44,7 +44,8 @@ typedef enum abd_flags {
ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */
ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */
ABD_FLAG_MULTI_CHUNK = 1 << 4 /* pages split over multiple chunks */
ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
} abd_flags_t;

typedef struct abd {
Expand All @@ -60,6 +61,7 @@ typedef struct abd {
} abd_scatter;
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
} abd_u;
} abd_t;
Expand All @@ -75,6 +77,13 @@ abd_is_linear(abd_t *abd)
return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
}

/*
 * Return B_TRUE when this ABD is linear but its buffer was obtained from
 * the page/scatter allocator (ABD_FLAG_LINEAR_PAGE is set), rather than
 * from zio_[data_]buf_alloc(); B_FALSE otherwise.
 */
static inline boolean_t
abd_is_linear_page(abd_t *abd)
{
	if (abd->abd_flags & ABD_FLAG_LINEAR_PAGE)
		return (B_TRUE);
	return (B_FALSE);
}

/*
* Allocations and deallocations
*/
Expand Down
120 changes: 79 additions & 41 deletions module/zfs/abd.c
Expand Up @@ -72,17 +72,19 @@
* (2) Fragmentation is less of an issue since when we are at the limit of
* allocatable space, we won't have to search around for a long free
* hole in the VA space for large ARC allocations. Each chunk is mapped in
* individually, so even if we weren't using segkpm (see next point) we
* individually, so even if we are using HIGHMEM (see next point) we
* wouldn't need to worry about finding a contiguous address range.
*
* (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
* on each ABD access. (If segkpm isn't available then we use all linear
* ABDs to avoid this penalty.) See seg_kpm.c for more details.
* (3) If we are not using HIGHMEM, then all physical memory is always
* mapped into the kernel's address space, so we also avoid the map /
* unmap costs on each ABD access.
*
* If we are not using HIGHMEM, scattered buffers which have only one chunk
* can be treated as linear buffers, because they are contiguous in the
* kernel's virtual address space. See abd_alloc_pages() for details.
*
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
* B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
* available, which is the case on all 32-bit systems and any 64-bit systems
* where kpm_enable is turned off.
* B_FALSE.
*
* In addition to directly allocating a linear or scattered ABD, it is also
* possible to create an ABD by requesting the "sub-ABD" starting at an offset
Expand Down Expand Up @@ -249,18 +251,6 @@ abd_chunkcnt_for_bytes(size_t size)
#define __GFP_RECLAIM __GFP_WAIT
#endif

static unsigned long
abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
{
struct page *page;

page = alloc_pages_node(nid, gfp, order);
if (!page)
return (0);

return ((unsigned long) page_address(page));
}

/*
* The goal is to minimize fragmentation by preferentially populating ABDs
* with higher order compound pages from a single zone. Allocation size is
Expand All @@ -283,19 +273,18 @@ abd_alloc_pages(abd_t *abd, size_t size)
size_t remaining_size;
int nid = NUMA_NO_NODE;
int alloc_pages = 0;
int order;

INIT_LIST_HEAD(&pages);

while (alloc_pages < nr_pages) {
unsigned long paddr;
unsigned chunk_pages;
int order;

order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
chunk_pages = (1U << order);

paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
if (paddr == 0) {
page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
if (page == NULL) {
if (order == 0) {
ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
schedule_timeout_interruptible(1);
Expand All @@ -305,7 +294,6 @@ abd_alloc_pages(abd_t *abd, size_t size)
continue;
}

page = virt_to_page(paddr);
list_add_tail(&page->lru, &pages);

if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
Expand Down Expand Up @@ -336,18 +324,52 @@ abd_alloc_pages(abd_t *abd, size_t size)
list_del(&page->lru);
}

if (chunks > 1) {
/*
* These conditions ensure that a possible transformation to a linear
* ABD would be valid.
*/
ASSERT(!PageHighMem(sg_page(table.sgl)));
ASSERT0(ABD_SCATTER(abd).abd_offset);

if (table.nents == 1) {
behlendorf marked this conversation as resolved.
Show resolved Hide resolved
/*
* Since there is only one entry, this ABD can be represented
* as a linear buffer. All single-page (4K) ABD's can be
* represented this way. Some multi-page ABD's can also be
* represented this way, if we were able to allocate a single
* "chunk" (higher-order "page" which represents a power-of-2
* series of physically-contiguous pages). This is often the
* case for 2-page (8K) ABD's.
*
* Representing a single-entry scatter ABD as a linear ABD
* has the performance advantage of avoiding the copy (and
* allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
* A performance increase of around 5% has been observed for
* ARC-cached reads (of small blocks which can take advantage
* of this).
*
* Note that this optimization is only possible because the
* pages are always mapped into the kernel's address space.
* This is not the case for highmem pages, so the
* optimization can not be made there.
*/
abd->abd_flags |= ABD_FLAG_LINEAR;
abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
abd->abd_u.abd_linear.abd_sgl = table.sgl;
abd->abd_u.abd_linear.abd_buf =
page_address(sg_page(table.sgl));
} else if (table.nents > 1) {
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

if (zones) {
ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
}
}

ABD_SCATTER(abd).abd_sgl = table.sgl;
ABD_SCATTER(abd).abd_nents = table.nents;
ABD_SCATTER(abd).abd_sgl = table.sgl;
ABD_SCATTER(abd).abd_nents = table.nents;
}
}
#else
/*
Expand Down Expand Up @@ -427,10 +449,6 @@ abd_free_pages(abd_t *abd)

struct page;

#define kpm_enable 1
#define abd_alloc_chunk(o) \
((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
#define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o))
#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
Expand Down Expand Up @@ -491,7 +509,7 @@ abd_alloc_pages(abd_t *abd, size_t size)
sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

abd_for_each_sg(abd, sg, nr_pages, i) {
struct page *p = abd_alloc_chunk(0);
struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
sg_set_page(sg, p, PAGESIZE, 0);
}
ABD_SCATTER(abd).abd_nents = nr_pages;
Expand All @@ -502,12 +520,11 @@ abd_free_pages(abd_t *abd)
{
int i, n = ABD_SCATTER(abd).abd_nents;
struct scatterlist *sg;
int j;

abd_for_each_sg(abd, sg, n, i) {
for (j = 0; j < sg->length; j += PAGESIZE) {
struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
abd_free_chunk(p, 0);
for (int j = 0; j < sg->length; j += PAGESIZE) {
struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
umem_free(p, PAGESIZE);
}
}

Expand Down Expand Up @@ -560,7 +577,7 @@ abd_verify(abd_t *abd)
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK));
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
Expand Down Expand Up @@ -613,6 +630,7 @@ abd_alloc(size_t size, boolean_t is_metadata)

abd_t *abd = abd_alloc_struct();
abd->abd_flags = ABD_FLAG_OWNER;
abd->abd_u.abd_scatter.abd_offset = 0;
abd_alloc_pages(abd, size);

if (is_metadata) {
Expand All @@ -622,8 +640,6 @@ abd_alloc(size_t size, boolean_t is_metadata)
abd->abd_parent = NULL;
zfs_refcount_create(&abd->abd_children);

abd->abd_u.abd_scatter.abd_offset = 0;

ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
Expand Down Expand Up @@ -681,6 +697,17 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
static void
abd_free_linear(abd_t *abd)
{
if (abd_is_linear_page(abd)) {
/* Transform it back into a scatter ABD for freeing */
struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
abd->abd_flags &= ~ABD_FLAG_LINEAR;
abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
ABD_SCATTER(abd).abd_nents = 1;
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_sgl = sg;
abd_free_scatter(abd);
return;
}
if (abd->abd_flags & ABD_FLAG_META) {
zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
} else {
Expand Down Expand Up @@ -718,7 +745,8 @@ abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
if (abd_is_linear(sabd)) {
if (abd_is_linear(sabd) &&
!abd_is_linear_page(sabd)) {
return (abd_alloc_linear(size, is_metadata));
} else {
return (abd_alloc(size, is_metadata));
Expand Down Expand Up @@ -966,6 +994,16 @@ abd_release_ownership_of_buf(abd_t *abd)
{
ASSERT(abd_is_linear(abd));
ASSERT(abd->abd_flags & ABD_FLAG_OWNER);

/*
* abd_free() needs to handle LINEAR_PAGE ABD's specially.
* Since that flag does not survive the
* abd_release_ownership_of_buf() -> abd_get_from_buf() ->
* abd_take_ownership_of_buf() sequence, we don't allow releasing
* these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
*/
ASSERT(!abd_is_linear_page(abd));

abd_verify(abd);

abd->abd_flags &= ~ABD_FLAG_OWNER;
Expand Down
19 changes: 13 additions & 6 deletions module/zfs/arc.c
Expand Up @@ -2917,18 +2917,26 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
/*
* If the hdr's data can be shared then we share the data buffer and
* set the appropriate bit in the hdr's b_flags to indicate the hdr is
* allocate a new buffer to store the buf's data.
* sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
* buffer to store the buf's data.
*
* There are two additional restrictions here because we're sharing
* hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
* actively involved in an L2ARC write, because if this buf is used by
* an arc_write() then the hdr's data buffer will be released when the
* write completes, even though the L2ARC write might still be using it.
* Second, the hdr's ABD must be linear so that the buf's user doesn't
* need to be ABD-aware.
*/
boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
* need to be ABD-aware. It must be allocated via
* zio_[data_]buf_alloc(), not as a page, because we need to be able
* to abd_release_ownership_of_buf(), which isn't allowed on "linear
* page" buffers because the ABD code needs to handle freeing them
* specially.
*/
boolean_t can_share = arc_can_share(hdr, buf) &&
!HDR_L2_WRITING(hdr) &&
hdr->b_l1hdr.b_pabd != NULL &&
abd_is_linear(hdr->b_l1hdr.b_pabd) &&
!abd_is_linear_page(hdr->b_l1hdr.b_pabd);

/* Set up b_data and sharing */
if (can_share) {
Expand Down Expand Up @@ -3731,7 +3739,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
* disk, it's easiest if we just set up sharing between the
* buf and the hdr.
*/
ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
arc_hdr_free_abd(hdr, B_FALSE);
arc_share_buf(hdr, buf);
}
Expand Down
6 changes: 0 additions & 6 deletions module/zfs/zio.c
Expand Up @@ -331,12 +331,6 @@ zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
{
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

/*
* Ensure that anyone expecting this zio to contain a linear ABD isn't
* going to get a nasty surprise when they try to access the data.
*/
IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));

zt->zt_orig_abd = zio->io_abd;
zt->zt_orig_size = zio->io_size;
zt->zt_bufsize = bufsize;
Expand Down