Skip to content

Commit

Permalink
OpenZFS 9337 - zfs get all is slow due to uncached metadata
Browse files Browse the repository at this point in the history
This project's goal is to make read-heavy channel programs and zfs(1m)
administrative commands faster by caching all the metadata that they will
need in the dbuf layer. This will prevent the data from being evicted, so
that any future call to i.e. zfs get all won't have to go to disk (very
much). There are two parts:

The dbuf_metadata_cache. We identify what to put into the cache based on
the object type of each dbuf.  Caching objset properties os
{version,normalization,utf8only,casesensitivity} in the objset_t. The reason
these needed to be cached is that although they are queried frequently,
they aren't stored in a dbuf type which we can easily recognize and cache in
the dbuf layer; instead, we have to explicitly store them. There's already
existing infrastructure for maintaining cached properties in the objset
setup code, so I simply used that.

Performance Testing:

 - Disabled kmem_flags
 - Tuned dbuf_cache_max_bytes very low (128K)
 - Tuned zfs_arc_max very low (64M)

Created test pool with 400 filesystems, and 100 snapshots per filesystem.
Later on in testing, added 600 more filesystems (with no snapshots) to make
sure scaling didn't look different between snapshots and filesystems.

Results:

    | Test                   | Time (trunk / diff) | I/Os (trunk / diff) |
    +------------------------+---------------------+---------------------+
    | zpool import           |     0:05 / 0:06     |    12.9k / 12.9k    |
    | zfs get all (uncached) |     1:36 / 0:53     |    16.7k / 5.7k     |
    | zfs get all (cached)   |     1:36 / 0:51     |    16.0k / 6.0k     |

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Ported-by: Alek Pinchuk <apinchuk@datto.com>
Signed-off-by: Alek Pinchuk <apinchuk@datto.com>

OpenZFS-issue: https://illumos.org/issues/9337
OpenZFS-commit: openzfs/openzfs@7dec52f
Closes #7668
  • Loading branch information
ahrens authored and behlendorf committed Jul 12, 2018
1 parent e4e94ca commit 2e5dc44
Show file tree
Hide file tree
Showing 17 changed files with 405 additions and 164 deletions.
16 changes: 12 additions & 4 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ typedef enum dbuf_states {
DB_EVICTING
} dbuf_states_t;

typedef enum dbuf_cached_state {
DB_NO_CACHE = -1,
DB_DBUF_CACHE,
DB_DBUF_METADATA_CACHE,
DB_CACHE_MAX
} dbuf_cached_state_t;

struct dnode;
struct dmu_tx;

Expand Down Expand Up @@ -240,11 +247,12 @@ typedef struct dmu_buf_impl {
*/
avl_node_t db_link;

/*
* Link in dbuf_cache.
*/
/* Link in dbuf_cache or dbuf_metadata_cache */
multilist_node_t db_cache_link;

/* Tells us which dbuf cache this dbuf is in, if any */
dbuf_cached_state_t db_caching_status;

/* Data which is unique to data (leaf) blocks: */

/* User callback information. */
Expand Down Expand Up @@ -305,7 +313,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
uint64_t dbuf_refcount(dmu_buf_impl_t *db);

void dbuf_rele(dmu_buf_impl_t *db, void *tag);
void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);

dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid);
Expand Down
7 changes: 6 additions & 1 deletion include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ typedef enum dmu_object_byteswap {
/*
* Defines a uint8_t object type. Object types specify if the data
* in the object is metadata (boolean) and how to byteswap the data
* (dmu_object_byteswap_t).
* (dmu_object_byteswap_t). All of the types created by this method
* are cached in the dbuf metadata cache.
*/
#define DMU_OT(byteswap, metadata, encrypted) \
(DMU_OT_NEWTYPE | \
Expand All @@ -119,6 +120,9 @@ typedef enum dmu_object_byteswap {
((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
(ot) < DMU_OT_NUMTYPES)

#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)

/*
* MDB doesn't have dmu_ot; it defines these macros itself.
*/
Expand Down Expand Up @@ -883,6 +887,7 @@ typedef void (*const arc_byteswap_func_t)(void *buf, size_t size);
typedef struct dmu_object_type_info {
dmu_object_byteswap_t ot_byteswap;
boolean_t ot_metadata;
boolean_t ot_dbuf_metadata_cache;
boolean_t ot_encrypt;
char *ot_name;
} dmu_object_type_info_t;
Expand Down
12 changes: 12 additions & 0 deletions include/sys/dmu_objset.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/sa.h>
#include <sys/zfs_ioctl.h>

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -90,6 +91,7 @@ typedef struct objset_phys {

typedef int (*dmu_objset_upgrade_cb_t)(objset_t *);

#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1)
struct objset {
/* Immutable: */
struct dsl_dataset *os_dsl_dataset;
Expand Down Expand Up @@ -125,6 +127,16 @@ struct objset {
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
int os_recordsize;
/*
* The next four values are used as a cache of whatever's on disk, and
* are initialized the first time these properties are queried. Before
* being initialized with their real values, their values are
* OBJSET_PROP_UNINITIALIZED.
*/
uint64_t os_version;
uint64_t os_normalization;
uint64_t os_utf8only;
uint64_t os_casesensitivity;

/*
* Pointer is constant; the blkptr it points to is protected by
Expand Down
2 changes: 1 addition & 1 deletion include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
void dnode_rele_and_unlock(dnode_t *dn, void *tag);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
Expand Down
1 change: 0 additions & 1 deletion include/sys/zfs_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,6 @@ extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *);
extern int zfs_secpolicy_destroy_perms(const char *, cred_t *);
extern void zfs_unmount_snap(const char *);
extern void zfs_destroy_unmount_origin(const char *);
extern boolean_t dataset_name_hidden(const char *);
extern int getzfsvfs_impl(struct objset *, struct zfsvfs **);
extern int getzfsvfs(const char *, struct zfsvfs **);

Expand Down
3 changes: 3 additions & 0 deletions include/zfs_comutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *);

extern int zfs_zpl_version_map(int spa_version);
extern int zfs_spa_version_map(int zpl_version);

extern boolean_t zfs_dataset_name_hidden(const char *);

#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41
extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS];

Expand Down
27 changes: 27 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,21 @@ kstat.
Default value: \fB0\fR.
.RE

.sp
.ne 2
.na
\fBdbuf_metadata_cache_max_bytes\fR (ulong)
.ad
.RS 12n
Maximum size in bytes of the metadata dbuf cache. When \fB0\fR this value will
default to \fB1/2^dbuf_cache_shift\fR (1/16) of the target ARC size, otherwise
the provided value in bytes will be used. The behavior of the metadata dbuf
cache and its associated settings can be observed via the
\fB/proc/spl/kstat/zfs/dbufstats\fR kstat.
.sp
Default value: \fB0\fR.
.RE

.sp
.ne 2
.na
Expand Down Expand Up @@ -77,6 +92,18 @@ of the target arc size.
Default value: \fB5\fR.
.RE

.sp
.ne 2
.na
\fBdbuf_metadata_cache_shift\fR (int)
.ad
.RS 12n
Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR,
to a log2 fraction of the target arc size.
.sp
Default value: \fB6\fR.
.RE

.sp
.ne 2
.na
Expand Down
18 changes: 18 additions & 0 deletions module/zcommon/zfs_comutil.c
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,28 @@ const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
"pool split",
};

boolean_t
zfs_dataset_name_hidden(const char *name)
{
/*
* Skip over datasets that are not visible in this zone,
* internal datasets (which have a $ in their name), and
* temporary datasets (which have a % in their name).
*/
if (strchr(name, '$') != NULL)
return (B_TRUE);
if (strchr(name, '%') != NULL)
return (B_TRUE);
if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
return (B_TRUE);
return (B_FALSE);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_allocatable_devs);
EXPORT_SYMBOL(zpool_get_load_policy);
EXPORT_SYMBOL(zfs_zpl_version_map);
EXPORT_SYMBOL(zfs_spa_version_map);
EXPORT_SYMBOL(zfs_history_event_names);
EXPORT_SYMBOL(zfs_dataset_name_hidden);
#endif

0 comments on commit 2e5dc44

Please sign in to comment.