Skip to content

Commit

Permalink
Implementation of block cloning for ZFS.
Browse files Browse the repository at this point in the history
Block cloning allows a file (or a subset of its blocks) to be manually
cloned into another (or the same) file by just creating additional
references to the data blocks, without copying the data itself.
Those references are kept in the Block Reference Tables (BRTs).

The whole design of block cloning is documented in module/zfs/brt.c.

Signed-off-by: Pawel Jakub Dawidek <pawel@dawidek.net>
  • Loading branch information
pjd committed Oct 25, 2022
1 parent 0b2428d commit 184c9e7
Show file tree
Hide file tree
Showing 44 changed files with 3,234 additions and 86 deletions.
19 changes: 19 additions & 0 deletions cmd/zdb/zdb_il.c
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,23 @@ zil_prt_rec_acl(zilog_t *zilog, int txtype, const void *arg)
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
}

/*
 * Print a TX_CLONE_RANGE log record: one summary line with the target
 * object, offset, length and block size, followed by every block pointer
 * in the record, each prefixed with its 1-based position and the total
 * count.
 *
 * zilog and txtype are unused; they exist only so that the signature
 * matches zil_prt_rec_func_t.  arg points to the lr_clone_range_t record.
 */
static void
zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
{
	(void) zilog, (void) txtype;
	const lr_clone_range_t *lr = arg;

	(void) printf("%sfoid %llu, offset %llx, length %llx, blksize=%llx\n",
	    tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
	    (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz);

	/*
	 * lr_nbps is a uint64_t, so the index has to be 64 bits wide as
	 * well: a narrower counter would wrap around before reaching a
	 * count larger than UINT_MAX and the loop would never terminate.
	 */
	for (uint64_t i = 0; i < lr->lr_nbps; i++) {
		(void) printf("%s[%llu/%llu] ", tab_prefix,
		    (u_longlong_t)(i + 1), (u_longlong_t)lr->lr_nbps);
		print_log_bp(&lr->lr_bps[i], "");
	}
}

typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *);
typedef struct zil_rec_info {
zil_prt_rec_func_t zri_print;
Expand Down Expand Up @@ -330,6 +347,8 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
{.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "},
{.zri_print = zil_prt_rec_setsaxattr,
.zri_name = "TX_SETSAXATTR "},
{.zri_print = zil_prt_rec_clone_range,
.zri_name = "TX_CLONE_RANGE "},
};

static int
Expand Down
2 changes: 1 addition & 1 deletion cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1901,7 +1901,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
if (zil_replaying(zd->zd_zilog, tx))
return;

if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t)))
write_state = WR_INDIRECT;

itx = zil_itx_create(TX_WRITE,
Expand Down
1 change: 1 addition & 0 deletions include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ COMMON_H = \
sys/bptree.h \
sys/bqueue.h \
sys/btree.h \
sys/brt.h \
sys/dataset_kstats.h \
sys/dbuf.h \
sys/ddt.h \
Expand Down
5 changes: 4 additions & 1 deletion include/os/freebsd/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ typedef struct zfs_soft_state {

#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp))
#define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync)
#define zn_rlimit_fsize(zp, uio) \
#define zn_rlimit_fsize(size) zfs_rlimit_fsize(size)
#define zn_rlimit_fsize_uio(zp, uio) \
vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio))

/* Called on entry to each ZFS vnode and vfs operation */
Expand Down Expand Up @@ -178,6 +179,8 @@ extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];

extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp,
char *buf);

extern int zfs_rlimit_fsize(off_t fsize);
#ifdef __cplusplus
}
#endif
Expand Down
1 change: 1 addition & 0 deletions include/os/linux/kernel/linux/mod_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ typedef const struct kernel_param zfs_kernel_param_t;
enum scope_prefix_types {
zfs,
zfs_arc,
zfs_brt,
zfs_condense,
zfs_dbuf,
zfs_dbuf_cache,
Expand Down
3 changes: 2 additions & 1 deletion include/os/linux/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ extern "C" {

#define zn_has_cached_data(zp) ((zp)->z_is_mapped)
#define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync)
#define zn_rlimit_fsize(zp, uio) (0)
#define zn_rlimit_fsize(size) (0)
#define zn_rlimit_fsize_uio(zp, uio) (0)

/*
* zhold() wraps igrab() on Linux, and igrab() may fail when the
Expand Down
62 changes: 62 additions & 0 deletions include/sys/brt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
*/

#ifndef _SYS_BRT_H
#define _SYS_BRT_H

#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/dmu.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Block Reference Table (BRT) interface.  Block cloning keeps the
 * additional references to cloned data blocks in the BRTs; the whole
 * design is documented in module/zfs/brt.c.
 */
extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);

extern uint64_t brt_get_dspace(spa_t *spa);
extern uint64_t brt_get_used(spa_t *spa);
extern uint64_t brt_get_logical_used(spa_t *spa);
extern uint64_t brt_get_ratio(spa_t *spa);

extern boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp);
extern void brt_init(void);
extern void brt_fini(void);

extern void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx);
extern void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx);
extern void brt_pending_apply(spa_t *spa, uint64_t txg);

extern void brt_create(spa_t *spa);
extern int brt_load(spa_t *spa);
extern void brt_unload(spa_t *spa);
extern void brt_sync(spa_t *spa, uint64_t txg);

#ifdef __cplusplus
}
#endif

#endif /* _SYS_BRT_H */
1 change: 1 addition & 0 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ typedef struct dbuf_dirty_record {
override_states_t dr_override_state;
uint8_t dr_copies;
boolean_t dr_nopwrite;
boolean_t dr_brtwrite;
boolean_t dr_has_raw_params;

/*
Expand Down
2 changes: 2 additions & 0 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,8 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx);

extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);

extern const ddt_ops_t ddt_zap_ops;

#ifdef __cplusplus
Expand Down
5 changes: 5 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -1059,6 +1059,11 @@ int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
uint64_t *off);

int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp);
void dmu_brt_addref(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps);

/*
* Initial setup and final teardown.
*/
Expand Down
3 changes: 3 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@ typedef enum {
ZPOOL_PROP_LOAD_GUID,
ZPOOL_PROP_AUTOTRIM,
ZPOOL_PROP_COMPATIBILITY,
ZPOOL_PROP_BRTUSED,
ZPOOL_PROP_BRTLOGICALUSED,
ZPOOL_PROP_BRTRATIO,
ZPOOL_NUM_PROPS
} zpool_prop_t;

Expand Down
1 change: 1 addition & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
struct brt *spa_brt; /* in-core BRT */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
Expand Down
4 changes: 4 additions & 0 deletions include/sys/zfs_vnops.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_holey(znode_t *, ulong_t, loff_t *);
extern int zfs_access(znode_t *, int, int, cred_t *);
extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *,
uint64_t *, cred_t *);
extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
const blkptr_t *, size_t);

extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *);
extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *);
Expand Down
3 changes: 3 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,9 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
extern void zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, uint64_t offset, uint64_t length, uint64_t blksz,
const blkptr_t *bps, size_t nbps);
extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern void zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
Expand Down
25 changes: 19 additions & 6 deletions include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ typedef enum zil_create {
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
#define TX_WRITE2 20 /* dmu_sync EALREADY write */
#define TX_SETSAXATTR 21 /* Set sa xattrs on file */
#define TX_MAX_TYPE 22 /* Max transaction type */
#define TX_CLONE_RANGE 22 /* Clone a file range */
#define TX_MAX_TYPE 23 /* Max transaction type */

/*
* The transactions for mkdir, symlink, remove, rmdir, link, and rename
Expand All @@ -174,9 +175,9 @@ typedef enum zil_create {
#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */

/*
* Transactions for write, truncate, setattr, acl_v0, and acl can be logged
* out of order. For convenience in the code, all such records must have
* lr_foid at the same offset.
* Transactions for operations below can be logged out of order.
* For convenience in the code, all such records must have lr_foid
* at the same offset.
*/
#define TX_OOO(txtype) \
((txtype) == TX_WRITE || \
Expand All @@ -185,7 +186,8 @@ typedef enum zil_create {
(txtype) == TX_ACL_V0 || \
(txtype) == TX_ACL || \
(txtype) == TX_WRITE2 || \
(txtype) == TX_SETSAXATTR)
(txtype) == TX_SETSAXATTR || \
(txtype) == TX_CLONE_RANGE)

/*
* The number of dnode slots consumed by the object is stored in the 8
Expand Down Expand Up @@ -372,6 +374,17 @@ typedef struct {
/* lr_acl_bytes number of variable sized ace's follows */
} lr_acl_t;

/*
 * Log record for TX_CLONE_RANGE.  TX_CLONE_RANGE is one of the TX_OOO
 * (out-of-order) record types, so lr_foid must stay at the same offset
 * as in the other such records.  The fixed-size part is followed by a
 * flexible array of lr_nbps block pointers.
 */
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* file object to clone into */
uint64_t lr_offset; /* offset to clone to */
uint64_t lr_length; /* length of the blocks to clone */
uint64_t lr_blksz; /* file's block size */
uint64_t lr_nbps; /* number of block pointers */
blkptr_t lr_bps[];
/* lr_nbps block pointers of the blocks to clone follow */
} lr_clone_range_t;

/*
* ZIL structure definitions, interface function prototype and globals.
*/
Expand Down Expand Up @@ -559,7 +572,7 @@ extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);

extern uint64_t zil_max_copied_data(zilog_t *zilog);
extern uint64_t zil_max_log_data(zilog_t *zilog);
extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize);

extern void zil_sums_init(zil_sums_t *zs);
extern void zil_sums_fini(zil_sums_t *zs);
Expand Down
3 changes: 2 additions & 1 deletion include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ typedef struct zio_prop {
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
boolean_t zp_nopwrite;
boolean_t zp_brtwrite;
boolean_t zp_encrypt;
boolean_t zp_byteorder;
uint8_t zp_salt[ZIO_DATA_SALT_LEN];
Expand Down Expand Up @@ -552,7 +553,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);

extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
boolean_t nopwrite);
boolean_t nopwrite, boolean_t brtwrite);

extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);

Expand Down
35 changes: 19 additions & 16 deletions include/sys/zio_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,28 +127,30 @@ enum zio_stage {

ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */

ZIO_STAGE_DDT_READ_START = 1 << 9, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 12, /* --F-- */
ZIO_STAGE_BRT_FREE = 1 << 9, /* --F-- */

ZIO_STAGE_GANG_ASSEMBLE = 1 << 13, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 14, /* RWFC- */
ZIO_STAGE_DDT_READ_START = 1 << 10, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 13, /* --F-- */

ZIO_STAGE_DVA_THROTTLE = 1 << 15, /* -W--- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 16, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 17, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 18, /* ---C- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC- */

ZIO_STAGE_READY = 1 << 19, /* RWFCI */
ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W--- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 18, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C- */

ZIO_STAGE_VDEV_IO_START = 1 << 20, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 21, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 22, /* RW--I */
ZIO_STAGE_READY = 1 << 20, /* RWFCI */

ZIO_STAGE_CHECKSUM_VERIFY = 1 << 23, /* R---- */
ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--I */

ZIO_STAGE_DONE = 1 << 24 /* RWFCI */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R---- */

ZIO_STAGE_DONE = 1 << 25 /* RWFCI */
};

#define ZIO_INTERLOCK_STAGES \
Expand Down Expand Up @@ -233,6 +235,7 @@ enum zio_stage {
#define ZIO_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
ZIO_STAGE_BRT_FREE | \
ZIO_STAGE_DVA_FREE)

#define ZIO_DDT_FREE_PIPELINE \
Expand Down
1 change: 1 addition & 0 deletions include/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ typedef enum spa_feature {
SPA_FEATURE_ZILSAXATTR,
SPA_FEATURE_HEAD_ERRLOG,
SPA_FEATURE_BLAKE3,
SPA_FEATURE_BLOCK_CLONING,
SPA_FEATURES
} spa_feature_t;

Expand Down
3 changes: 3 additions & 0 deletions lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
case ZPOOL_PROP_ASHIFT:
case ZPOOL_PROP_MAXBLOCKSIZE:
case ZPOOL_PROP_MAXDNODESIZE:
case ZPOOL_PROP_BRTUSED:
case ZPOOL_PROP_BRTLOGICALUSED:
if (literal)
(void) snprintf(buf, len, "%llu",
(u_longlong_t)intval);
Expand Down Expand Up @@ -380,6 +382,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
}
break;

case ZPOOL_PROP_BRTRATIO:
case ZPOOL_PROP_DEDUPRATIO:
if (literal)
(void) snprintf(buf, len, "%llu.%02llu",
Expand Down
1 change: 1 addition & 0 deletions lib/libzpool/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/bptree.c \
module/zfs/bqueue.c \
module/zfs/btree.c \
module/zfs/brt.c \
module/zfs/dbuf.c \
module/zfs/dbuf_stats.c \
module/zfs/ddt.c \
Expand Down
2 changes: 2 additions & 0 deletions module/Kbuild.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel
ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl
ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs
ZFS_MODULE_CFLAGS += -I$(zfs_include)
ZFS_MODULE_CFLAGS += -I$(icp_include)
ZFS_MODULE_CPPFLAGS += -D_KERNEL
ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@

Expand Down Expand Up @@ -296,6 +297,7 @@ ZFS_OBJS := \
bpobj.o \
bptree.o \
bqueue.o \
brt.o \
btree.o \
dataset_kstats.o \
dbuf.o \
Expand Down
Loading

0 comments on commit 184c9e7

Please sign in to comment.