@@ -244,6 +244,7 @@ typedef enum {
ZPOOL_PROP_MULTIHOST,
ZPOOL_PROP_CHECKPOINT,
ZPOOL_PROP_LOAD_GUID,
ZPOOL_PROP_AUTOTRIM,
ZPOOL_NUM_PROPS
} zpool_prop_t;

@@ -635,13 +636,15 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue"
#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue"
#define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue"

/* Queue sizes */
#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue"
#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue"
#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue"
#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue"
#define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue"

/* Latency read/write histogram stats */
#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo"
@@ -653,18 +656,21 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo"
#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo"
#define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo"

/* Request size histograms */
#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo"
#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo"
#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo"

/* Number of slow IOs */
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
@@ -777,13 +783,28 @@ typedef struct zpool_load_policy {
#define VDEV_ALLOC_BIAS_SPECIAL "special"
#define VDEV_ALLOC_BIAS_DEDUP "dedup"

/* vdev initialize state */
#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
"com.delphix:next_offset_to_initialize"
#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
"com.delphix:vdev_initialize_state"
#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
"com.delphix:vdev_initialize_action_time"

/* vdev TRIM state */
#define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \
"org.zfsonlinux:next_offset_to_trim"
#define VDEV_LEAF_ZAP_TRIM_STATE \
"org.zfsonlinux:vdev_trim_state"
#define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \
"org.zfsonlinux:vdev_trim_action_time"
#define VDEV_LEAF_ZAP_TRIM_RATE \
"org.zfsonlinux:vdev_trim_rate"
#define VDEV_LEAF_ZAP_TRIM_PARTIAL \
"org.zfsonlinux:vdev_trim_partial"
#define VDEV_LEAF_ZAP_TRIM_SECURE \
"org.zfsonlinux:vdev_trim_secure"

/*
* This is needed in userland to report the minimum necessary device size.
*/
@@ -915,6 +936,7 @@ typedef enum zio_type {
ZIO_TYPE_FREE,
ZIO_TYPE_CLAIM,
ZIO_TYPE_IOCTL,
ZIO_TYPE_TRIM,
ZIO_TYPES
} zio_type_t;

@@ -982,8 +1004,14 @@ typedef enum zpool_errata {

/*
* Vdev statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
* is passed between kernel and user land as an nvlist uint64 array.
*
* The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in
* order to keep subsequent members at their known fixed offsets. When
* adding a new field it must be added to the end the structure.
*/
#define VS_ZIO_TYPES 6

typedef struct vdev_stat {
hrtime_t vs_timestamp; /* time since vdev load */
uint64_t vs_state; /* vdev state */
@@ -993,8 +1021,8 @@ typedef struct vdev_stat {
uint64_t vs_dspace; /* deflated capacity */
uint64_t vs_rsize; /* replaceable dev size */
uint64_t vs_esize; /* expandable dev size */
uint64_t vs_ops[ZIO_TYPES]; /* operation count */
uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */
uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */
uint64_t vs_read_errors; /* read errors */
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
@@ -1010,6 +1038,12 @@ typedef struct vdev_stat {
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
uint64_t vs_slow_ios; /* slow IOs */
uint64_t vs_trim_errors; /* trimming errors */
uint64_t vs_trim_notsup; /* supported by device */
uint64_t vs_trim_bytes_done; /* bytes trimmed */
uint64_t vs_trim_bytes_est; /* total bytes to trim */
uint64_t vs_trim_state; /* vdev_trim_state_t */
uint64_t vs_trim_action_time; /* time_t */
} vdev_stat_t;

/*
@@ -1068,12 +1102,22 @@ typedef struct vdev_stat_ex {
* Initialize functions.
*/
typedef enum pool_initialize_func {
POOL_INITIALIZE_DO,
POOL_INITIALIZE_START,
POOL_INITIALIZE_CANCEL,
POOL_INITIALIZE_SUSPEND,
POOL_INITIALIZE_FUNCS
} pool_initialize_func_t;

/*
* TRIM functions.
*/
typedef enum pool_trim_func {
POOL_TRIM_START,
POOL_TRIM_CANCEL,
POOL_TRIM_SUSPEND,
POOL_TRIM_FUNCS
} pool_trim_func_t;

/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
@@ -1126,6 +1170,14 @@ typedef enum {
VDEV_INITIALIZE_COMPLETE
} vdev_initializing_state_t;

typedef enum {
VDEV_TRIM_NONE,
VDEV_TRIM_ACTIVE,
VDEV_TRIM_CANCELED,
VDEV_TRIM_SUSPENDED,
VDEV_TRIM_COMPLETE,
} vdev_trim_state_t;

/*
* nvlist name constants. Facilitate restricting snapshot iteration range for
* the "list next snapshot" ioctl
@@ -1224,6 +1276,7 @@ typedef enum zfs_ioc {
ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */
ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */
ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */
ZFS_IOC_POOL_TRIM, /* 0x5a50 */

/*
* Linux - 3/64 numbers reserved.
@@ -1326,6 +1379,14 @@ typedef enum {
#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"

/*
* The following are names used when invoking ZFS_IOC_POOL_TRIM.
*/
#define ZPOOL_TRIM_COMMAND "trim_command"
#define ZPOOL_TRIM_VDEVS "trim_vdevs"
#define ZPOOL_TRIM_RATE "trim_rate"
#define ZPOOL_TRIM_SECURE "trim_secure"

/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
@@ -120,6 +120,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
void metaslab_disable(metaslab_t *);
void metaslab_enable(metaslab_t *, boolean_t);

#ifdef __cplusplus
}
@@ -69,7 +69,7 @@ typedef enum trace_alloc_type {
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
TRACE_VDEV_ERROR = -8ULL,
TRACE_INITIALIZING = -9ULL
TRACE_DISABLED = -9ULL,
} trace_alloc_type_t;

#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
@@ -272,10 +272,10 @@ struct metaslab_group {
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];

int mg_ms_initializing;
boolean_t mg_initialize_updating;
kmutex_t mg_ms_initialize_lock;
kcondvar_t mg_ms_initialize_cv;
int mg_ms_disabled;
boolean_t mg_disabled_updating;
kmutex_t mg_ms_disabled_lock;
kcondvar_t mg_ms_disabled_cv;
};

/*
@@ -389,11 +389,24 @@ struct metaslab {
range_tree_t *ms_defer[TXG_DEFER_SIZE];
range_tree_t *ms_checkpointing; /* to add to the checkpoint */

/*
* The ms_trim tree is the set of allocatable segments which are
* eligible for trimming. (When the metaslab is loaded, it's a
* subset of ms_allocatable.) It's kept in-core as long as the
* autotrim property is set and is not vacated when the metaslab
* is unloaded. Its purpose is to aggregate freed ranges to
* facilitate efficient trimming.
*/
range_tree_t *ms_trim;

boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;

uint64_t ms_initializing; /* leaves initializing this ms */
/*
* The number of consumers which have disabled the metaslab.
*/
uint64_t ms_disabled;

/*
* We must always hold the ms_lock when modifying ms_loaded
@@ -738,6 +738,24 @@ typedef enum spa_import_type {
SPA_IMPORT_ASSEMBLE
} spa_import_type_t;

/*
* Send TRIM commands in-line during normal pool operation while deleting.
* OFF: no
* ON: yes
*/
typedef enum {
SPA_AUTOTRIM_OFF = 0, /* default */
SPA_AUTOTRIM_ON
} spa_autotrim_t;

/*
* Reason TRIM command was issued, used internally for accounting purposes.
*/
typedef enum trim_type {
TRIM_TYPE_MANUAL = 0,
TRIM_TYPE_AUTO = 1,
} trim_type_t;

/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
@@ -764,15 +782,17 @@ extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);

#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
#define SPA_ASYNC_REMOVE_DONE 0x40
#define SPA_ASYNC_REMOVE_STOP 0x80
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
#define SPA_ASYNC_REMOVE_DONE 0x40
#define SPA_ASYNC_REMOVE_STOP 0x80
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400

/*
* Controls the behavior of spa_vdev_remove().
@@ -790,6 +810,8 @@ extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern boolean_t spa_vdev_remove_active(spa_t *spa);
extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
nvlist_t *vdev_errlist);
extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
@@ -887,6 +909,7 @@ typedef struct spa_stats {
spa_history_kstat_t io_history;
spa_history_list_t mmp_history;
spa_history_kstat_t state; /* pool state */
spa_history_kstat_t iostats;
} spa_stats_t;

typedef enum txg_state {
@@ -905,6 +928,22 @@ typedef struct txg_stat {
uint64_t ndirty;
} txg_stat_t;

/* Assorted pool IO kstats */
typedef struct spa_iostats {
kstat_named_t trim_extents_written;
kstat_named_t trim_bytes_written;
kstat_named_t trim_extents_skipped;
kstat_named_t trim_bytes_skipped;
kstat_named_t trim_extents_failed;
kstat_named_t trim_bytes_failed;
kstat_named_t autotrim_extents_written;
kstat_named_t autotrim_bytes_written;
kstat_named_t autotrim_extents_skipped;
kstat_named_t autotrim_bytes_skipped;
kstat_named_t autotrim_extents_failed;
kstat_named_t autotrim_bytes_failed;
} spa_iostats_t;

extern void spa_stats_init(spa_t *spa);
extern void spa_stats_destroy(spa_t *spa);
extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
@@ -922,6 +961,10 @@ extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
int error);
extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
uint64_t extents_written, uint64_t bytes_written,
uint64_t extents_skipped, uint64_t bytes_skipped,
uint64_t extents_failed, uint64_t bytes_failed);

/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
@@ -1005,6 +1048,7 @@ extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
extern uint64_t spa_deadman_ziotime(spa_t *spa);
extern uint64_t spa_dirty_data(spa_t *spa);
extern spa_autotrim_t spa_get_autotrim(spa_t *spa);

/* Miscellaneous support routines */
extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
@@ -378,6 +378,7 @@ struct spa {
uint64_t spa_deadman_ziotime; /* deadman zio expiration */
uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
uint64_t spa_autotrim; /* automatic background trim? */
uint64_t spa_errata; /* errata issues detected */
spa_stats_t spa_stats; /* assorted spa statistics */
spa_keystore_t spa_keystore; /* loaded crypto keys */
@@ -118,6 +118,11 @@ extern "C" {
#define ESC_ZFS_BOOTFS_VDEV_ATTACH "bootfs_vdev_attach"
#define ESC_ZFS_POOL_REGUID "pool_reguid"
#define ESC_ZFS_HISTORY_EVENT "history_event"
#define ESC_ZFS_TRIM_START "trim_start"
#define ESC_ZFS_TRIM_FINISH "trim_finish"
#define ESC_ZFS_TRIM_CANCEL "trim_cancel"
#define ESC_ZFS_TRIM_RESUME "trim_resume"
#define ESC_ZFS_TRIM_SUSPEND "trim_suspend"

/*
* datalink subclass definitions.
@@ -90,10 +90,11 @@ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
/*
* Wait until the given transaction group, or one after it, is
* the open transaction group. Try to make this happen as soon
* as possible (eg. kick off any necessary syncs immediately).
* If txg == 0, wait for the next open txg.
* as possible (eg. kick off any necessary syncs immediately) when
* should_quiesce is set. If txg == 0, wait for the next open txg.
*/
extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg,
boolean_t should_quiesce);

/*
* Returns TRUE if we are "backed up" waiting for the syncing
@@ -95,6 +95,8 @@ extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
range_seg_t *physical_rs);

extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
@@ -145,6 +145,7 @@ struct vdev_queue {
avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
avl_tree_t vq_trim_offset_tree;
uint64_t vq_last_offset;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
@@ -260,6 +261,7 @@ struct vdev {
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */

/* Initialize related */
boolean_t vdev_initialize_exit_wanted;
vdev_initializing_state_t vdev_initialize_state;
list_node_t vdev_initialize_node;
@@ -274,10 +276,34 @@ struct vdev {
uint64_t vdev_initialize_bytes_done;
time_t vdev_initialize_action_time; /* start and end time */

/* for limiting outstanding I/Os */
/* TRIM related */
boolean_t vdev_trim_exit_wanted;
boolean_t vdev_autotrim_exit_wanted;
vdev_trim_state_t vdev_trim_state;
list_node_t vdev_trim_node;
kmutex_t vdev_autotrim_lock;
kcondvar_t vdev_autotrim_cv;
kthread_t *vdev_autotrim_thread;
/* Protects vdev_trim_thread and vdev_trim_state. */
kmutex_t vdev_trim_lock;
kcondvar_t vdev_trim_cv;
kthread_t *vdev_trim_thread;
uint64_t vdev_trim_offset[TXG_SIZE];
uint64_t vdev_trim_last_offset;
uint64_t vdev_trim_bytes_est;
uint64_t vdev_trim_bytes_done;
uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */
uint64_t vdev_trim_partial; /* requested partial TRIM */
uint64_t vdev_trim_secure; /* requested secure TRIM */
time_t vdev_trim_action_time; /* start and end time */

/* for limiting outstanding I/Os (initialize and TRIM) */
kmutex_t vdev_initialize_io_lock;
kcondvar_t vdev_initialize_io_cv;
uint64_t vdev_initialize_inflight;
kmutex_t vdev_trim_io_lock;
kcondvar_t vdev_trim_io_cv;
uint64_t vdev_trim_inflight[2];

/*
* Values stored in the config for an indirect or removing vdev.
@@ -343,6 +369,8 @@ struct vdev {
uint64_t vdev_not_present; /* not present during import */
uint64_t vdev_unspare; /* unspare when resilvering done */
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
boolean_t vdev_has_trim; /* TRIM is supported */
boolean_t vdev_has_securetrim; /* secure TRIM is supported */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
boolean_t vdev_splitting; /* split or repair in progress */
@@ -39,8 +39,6 @@ extern void vdev_initialize_stop_all(vdev_t *vd,
vdev_initializing_state_t tgt_state);
extern void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list);
extern void vdev_initialize_restart(vdev_t *vd);
extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
range_seg_t *physical_rs);

#ifdef __cplusplus
}
@@ -0,0 +1,52 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2019 Lawrence Livermore National Security, LLC.
*/

#ifndef _SYS_VDEV_TRIM_H
#define _SYS_VDEV_TRIM_H

#include <sys/spa.h>

#ifdef __cplusplus
extern "C" {
#endif

extern unsigned int zfs_trim_metaslab_skip;

extern void vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial,
boolean_t secure);
extern void vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt, list_t *vd_list);
extern void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state);
extern void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list);
extern void vdev_trim_restart(vdev_t *vd);
extern void vdev_autotrim(spa_t *spa);
extern void vdev_autotrim_stop_all(spa_t *spa);
extern void vdev_autotrim_stop_wait(vdev_t *vd);
extern void vdev_autotrim_restart(spa_t *spa);

#ifdef __cplusplus
}
#endif

#endif /* _SYS_VDEV_TRIM_H */
@@ -579,6 +579,8 @@ typedef struct vsecattr {

#define CRCREAT 0

#define F_FREESP 11

extern int fop_getattr(vnode_t *vp, vattr_t *vap);

#define VOP_CLOSE(vp, f, c, o, cr, ct) vn_close(vp)
@@ -587,6 +589,16 @@ extern int fop_getattr(vnode_t *vp, vattr_t *vap);

#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd)

#if defined(HAVE_FILE_FALLOCATE) && \
defined(FALLOC_FL_PUNCH_HOLE) && \
defined(FALLOC_FL_KEEP_SIZE)
#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) \
fallocate((vp)->v_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, \
(flck)->l_start, (flck)->l_len)
#else
#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) (0)
#endif

#define VN_RELE(vp) vn_close(vp)

extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
@@ -54,6 +54,7 @@ extern int zfs_dbgmsg_enable;
#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
#define ZFS_DEBUG_SET_ERROR (1 << 9)
#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10)
#define ZFS_DEBUG_TRIM (1 << 11)

extern void __zfs_dbgmsg(char *buf);
extern void __dprintf(boolean_t dprint, const char *file, const char *func,
@@ -416,6 +416,14 @@ typedef zio_t *zio_pipe_stage_t(zio_t *zio);
#define ZIO_REEXECUTE_NOW 0x01
#define ZIO_REEXECUTE_SUSPEND 0x02

/*
* The io_trim flags are used to specify the type of TRIM to perform. They
* only apply to ZIO_TYPE_TRIM zios are distinct from io_flags.
*/
enum trim_flag {
ZIO_TRIM_SECURE = 1 << 0,
};

typedef struct zio_alloc_list {
list_t zal_list;
uint64_t zal_size;
@@ -434,6 +442,7 @@ struct zio {
zio_prop_t io_prop;
zio_type_t io_type;
enum zio_child io_child_type;
enum trim_flag io_trim_flags;
int io_cmd;
zio_priority_t io_priority;
uint8_t io_reexecute;
@@ -549,6 +558,10 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, enum trim_flag trim_flags);

extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority,
@@ -250,6 +250,11 @@ enum zio_stage {
ZIO_STAGE_VDEV_IO_START | \
ZIO_STAGE_VDEV_IO_ASSESS)

#define ZIO_TRIM_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_ISSUE_ASYNC | \
ZIO_VDEV_IO_STAGES)

#define ZIO_BLOCKING_STAGES \
(ZIO_STAGE_DVA_ALLOCATE | \
ZIO_STAGE_DVA_CLAIM | \
@@ -30,6 +30,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_TRIM, /* trim I/O (discard) */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;
@@ -2092,6 +2092,57 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
return (ret);
}

/*
* Translate vdev names to guids. If a vdev_path is determined to be
* unsuitable then a vd_errlist is allocated and the vdev path and errno
* are added to it.
*/
static int
zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds,
nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist)
{
nvlist_t *errlist = NULL;
int error = 0;

for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
elem = nvlist_next_nvpair(vds, elem)) {
boolean_t spare, cache;

char *vd_path = nvpair_name(elem);
nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache,
NULL);

if ((tgt == NULL) || cache || spare) {
if (errlist == NULL) {
errlist = fnvlist_alloc();
error = EINVAL;
}

uint64_t err = (tgt == NULL) ? EZFS_NODEVICE :
(spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
fnvlist_add_int64(errlist, vd_path, err);
continue;
}

uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
fnvlist_add_uint64(vdev_guids, vd_path, guid);

char msg[MAXNAMELEN];
(void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
fnvlist_add_string(guids_to_paths, msg, vd_path);
}

if (error != 0) {
verify(errlist != NULL);
if (vd_errlist != NULL)
*vd_errlist = errlist;
else
fnvlist_free(errlist);
}

return (error);
}

static int
xlate_init_err(int err)
{
@@ -2118,72 +2169,152 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
nvlist_t *vds)
{
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;

nvlist_t *errlist;
int err;

/* translate vdev names to guids */
nvlist_t *vdev_guids = fnvlist_alloc();
nvlist_t *guids_to_paths = fnvlist_alloc();
boolean_t spare, cache;
nvlist_t *tgt;
nvlist_t *vd_errlist = NULL;
nvlist_t *errlist;
nvpair_t *elem;

for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
elem = nvlist_next_nvpair(vds, elem)) {
char *vd_path = nvpair_name(elem);
tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL);
err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
guids_to_paths, &vd_errlist);

if ((tgt == NULL) || cache || spare) {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot initialize '%s'"),
vd_path);
int err = (tgt == NULL) ? EZFS_NODEVICE :
(spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
if (err == 0) {
err = lzc_initialize(zhp->zpool_name, cmd_type,
vdev_guids, &errlist);
if (err == 0) {
fnvlist_free(vdev_guids);
fnvlist_free(guids_to_paths);
return (zfs_error(hdl, err, msg));
return (0);
}

uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
fnvlist_add_uint64(vdev_guids, vd_path, guid);
if (errlist != NULL) {
vd_errlist = fnvlist_lookup_nvlist(errlist,
ZPOOL_INITIALIZE_VDEVS);
}

(void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
fnvlist_add_string(guids_to_paths, msg, vd_path);
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "operation failed"));
} else {
verify(vd_errlist != NULL);
}

for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
elem = nvlist_next_nvpair(vd_errlist, elem)) {
int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
char *path;

if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
&path) != 0)
path = nvpair_name(elem);

(void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
"cannot initialize '%s'", path);
}

int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids,
&errlist);
fnvlist_free(vdev_guids);
fnvlist_free(guids_to_paths);

if (err == 0) {
fnvlist_free(guids_to_paths);
return (0);
if (vd_errlist != NULL) {
fnvlist_free(vd_errlist);
return (-1);
}

return (zpool_standard_error(zhp->zpool_hdl, err, msg));
}

static int
xlate_trim_err(int err)
{
switch (err) {
case ENODEV:
return (EZFS_NODEVICE);
case EINVAL:
case EROFS:
return (EZFS_BADDEV);
case EBUSY:
return (EZFS_TRIMMING);
case ESRCH:
return (EZFS_NO_TRIM);
case EOPNOTSUPP:
return (EZFS_TRIM_NOTSUP);
}
return (err);
}

/*
* Begin, suspend, or cancel the TRIM (discarding of all free blocks) for
* the given vdevs in the given pool.
*/
int
zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds,
trimflags_t *trim_flags)
{
char msg[1024];
int err;

nvlist_t *vdev_guids = fnvlist_alloc();
nvlist_t *guids_to_paths = fnvlist_alloc();
nvlist_t *vd_errlist = NULL;
if (errlist != NULL) {
vd_errlist = fnvlist_lookup_nvlist(errlist,
ZPOOL_INITIALIZE_VDEVS);
nvlist_t *errlist;
nvpair_t *elem;

err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
guids_to_paths, &vd_errlist);
if (err == 0) {
err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate,
trim_flags->secure, vdev_guids, &errlist);
if (err == 0) {
fnvlist_free(vdev_guids);
fnvlist_free(guids_to_paths);
return (0);
}

if (errlist != NULL) {
vd_errlist = fnvlist_lookup_nvlist(errlist,
ZPOOL_TRIM_VDEVS);
}

(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "operation failed"));
} else {
verify(vd_errlist != NULL);
}

(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "operation failed"));
for (elem = nvlist_next_nvpair(vd_errlist, NULL);
elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) {
int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem));
char *path;

for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
elem = nvlist_next_nvpair(vd_errlist, elem)) {
int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
char *path = fnvlist_lookup_string(guids_to_paths,
nvpair_name(elem));
(void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'",
path);
/*
* If only the pool was specified, and it was not a secure
* trim then suppress warnings for individual vdevs which
* do not support trimming.
*/
if (vd_error == EZFS_TRIM_NOTSUP &&
trim_flags->fullpool &&
!trim_flags->secure) {
continue;
}

if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
&path) != 0)
path = nvpair_name(elem);

(void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
"cannot trim '%s'", path);
}

fnvlist_free(vdev_guids);
fnvlist_free(guids_to_paths);
if (vd_errlist != NULL)

if (vd_errlist != NULL) {
fnvlist_free(vd_errlist);
return (-1);
}

return (zpool_standard_error(hdl, err, msg));
return (zpool_standard_error(zhp->zpool_hdl, err, msg));
}

/*
@@ -292,6 +292,13 @@ libzfs_error_description(libzfs_handle_t *hdl)
"initialization"));
case EZFS_WRONG_PARENT:
return (dgettext(TEXT_DOMAIN, "invalid parent dataset"));
case EZFS_TRIMMING:
return (dgettext(TEXT_DOMAIN, "currently trimming"));
case EZFS_NO_TRIM:
return (dgettext(TEXT_DOMAIN, "there is no active trim"));
case EZFS_TRIM_NOTSUP:
return (dgettext(TEXT_DOMAIN, "trim operations are not "
"supported by this device"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
@@ -1412,7 +1412,8 @@ lzc_reopen(const char *pool_name, boolean_t scrub_restart)
* - ENODEV if the device was not found
* - EINVAL if the devices is not a leaf or is not concrete (e.g. missing)
* - EROFS if the device is not writeable
* - EBUSY start requested but the device is already being initialized
* - EBUSY start requested but the device is already being either
* initialized or trimmed
* - ESRCH cancel/suspend requested but device is not being initialized
*
* If the errlist is empty, then return value will be:
@@ -1425,6 +1426,7 @@ lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
nvlist_t *vdevs, nvlist_t **errlist)
{
int error;

nvlist_t *args = fnvlist_alloc();
fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type);
fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
@@ -1435,3 +1437,45 @@ lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,

return (error);
}

/*
* Changes TRIM state.
*
* vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID.
* The key is ignored.
*
* If there are errors related to vdev arguments, per-vdev errors are returned
* in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where
* guid is stringified with PRIu64, and errno is one of the following as
* an int64_t:
* - ENODEV if the device was not found
* - EINVAL if the devices is not a leaf or is not concrete (e.g. missing)
* - EROFS if the device is not writeable
* - EBUSY start requested but the device is already being either trimmed
* or initialized
* - ESRCH cancel/suspend requested but device is not being initialized
* - EOPNOTSUPP if the device does not support TRIM (or secure TRIM)
*
* If the errlist is empty, then return value will be:
* - EINVAL if one or more arguments was invalid
* - Other spa_open failures
* - 0 if the operation succeeded
*/
int
lzc_trim(const char *poolname, pool_trim_func_t cmd_type, uint64_t rate,
boolean_t secure, nvlist_t *vdevs, nvlist_t **errlist)
{
int error;

nvlist_t *args = fnvlist_alloc();
fnvlist_add_uint64(args, ZPOOL_TRIM_COMMAND, (uint64_t)cmd_type);
fnvlist_add_nvlist(args, ZPOOL_TRIM_VDEVS, vdevs);
fnvlist_add_uint64(args, ZPOOL_TRIM_RATE, rate);
fnvlist_add_boolean_value(args, ZPOOL_TRIM_SECURE, secure);

error = lzc_ioctl(ZFS_IOC_POOL_TRIM, poolname, args, errlist);

fnvlist_free(args);

return (error);
}
@@ -130,6 +130,7 @@ KERNEL_C = \
vdev_raidz_math_ssse3.c \
vdev_removal.c \
vdev_root.c \
vdev_trim.c \
zap.c \
zap_leaf.c \
zap_micro.c \
@@ -14,7 +14,7 @@
.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.TH ZFS-MODULE-PARAMETERS 5 "Feb 8, 2019"
.TH ZFS-MODULE-PARAMETERS 5 "Feb 15, 2019"
.SH NAME
zfs\-module\-parameters \- ZFS module parameters
.SH DESCRIPTION
@@ -1532,6 +1532,30 @@ See the section "ZFS I/O SCHEDULER".
Default value: \fB10\fR.
.RE

.sp
.ne 2
.na
\fBzfs_vdev_trim_max_active\fR (int)
.ad
.RS 12n
Maximum trim/discard I/Os active to each device.
See the section "ZFS I/O SCHEDULER".
.sp
Default value: \fB2\fR.
.RE

.sp
.ne 2
.na
\fBzfs_vdev_trim_min_active\fR (int)
.ad
.RS 12n
Minimum trim/discard I/Os active to each device.
See the section "ZFS I/O SCHEDULER".
.sp
Default value: \fB1\fR.
.RE

.sp
.ne 2
.na
@@ -1619,6 +1643,12 @@ _
_
512 ZFS_DEBUG_SET_ERROR
Enable SET_ERROR and dprintf entries in the debug log.
_
1024 ZFS_DEBUG_INDIRECT_REMAP
Verify split blocks created by device removal.
_
2048 ZFS_DEBUG_TRIM
Verify TRIM ranges are always within the allocatable range tree.
.TE
.sp
* Requires debug build.
@@ -2341,6 +2371,82 @@ value of 75% will create a maximum of one thread per cpu.
Default value: \fB75\fR%.
.RE

.sp
.ne 2
.na
\fBzfs_trim_extent_bytes_max\fR (unsigned int)
.ad
.RS 12n
Maximum size of TRIM command. Ranges larger than this will be split in to
chunks no larger than \fBzfs_trim_extent_bytes_max\fR bytes before being
issued to the device.
.sp
Default value: \fB134,217,728\fR.
.RE

.sp
.ne 2
.na
\fBzfs_trim_extent_bytes_min\fR (unsigned int)
.ad
.RS 12n
Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped
unless they're part of a larger range which was broken in to chunks. This is
done because it's common for these small TRIMs to negatively impact overall
performance. This value can be set to 0 to TRIM all unallocated space.
.sp
Default value: \fB32,768\fR.
.RE

.sp
.ne 2
.na
\fBzfs_trim_metaslab_skip\fR (unsigned int)
.ad
.RS 12n
Skip uninitialized metaslabs during the TRIM process. This option is useful
for pools constructed from large thinly-provisioned devices where TRIM
operations are slow. As a pool ages an increasing fraction of the pools
metaslabs will be initialized progressively degrading the usefulness of
this option. This setting is stored when starting a manual TRIM and will
persist for the duration of the requested TRIM.
.sp
Default value: \fB0\fR.
.RE

.sp
.ne 2
.na
\fBzfs_trim_queue_limit\fR (unsigned int)
.ad
.RS 12n
Maximum number of queued TRIMs outstanding per leaf vdev. The number of
concurrent TRIM commands issued to the device is controlled by the
\fBzfs_vdev_trim_min_active\fR and \fBzfs_vdev_trim_max_active\fR module
options.
.sp
Default value: \fB10\fR.
.RE

.sp
.ne 2
.na
\fBzfs_trim_txg_batch\fR (unsigned int)
.ad
.RS 12n
The number of transaction groups worth of frees which should be aggregated
before TRIM operations are issued to the device. This setting represents a
trade-off between issuing larger, more efficient TRIM operations and the
delay before the recently trimmed space is available for use by the device.
.sp
Increasing this value will allow frees to be aggregated for a longer time.
This will result is larger TRIM operations and potentially increased memory
usage. Decreasing this value will have the opposite effect. The default
value of 32 was determined to be a reasonable compromise.
.sp
Default value: \fB32\fR.
.RE

.sp
.ne 2
.na
@@ -2364,6 +2470,19 @@ Flush dirty data to disk at least every N seconds (maximum txg duration)
Default value: \fB5\fR.
.RE

.sp
.ne 2
.na
\fBzfs_vdev_aggregate_trim\fR (int)
.ad
.RS 12n
Allow TRIM I/Os to be aggregated. This is normally not helpful because
the extents to be trimmed will have been already been aggregated by the
metaslab. This option is provided for debugging and performance analysis.
.sp
Default value: \fB0\fR.
.RE

.sp
.ne 2
.na
@@ -174,6 +174,13 @@
.Op Fl s | Fl p
.Ar pool Ns ...
.Nm
.Cm trim
.Op Fl d
.Op Fl r Ar rate
.Op Fl c | Fl s
.Ar pool
.Op Ar device Ns ...
.Nm
.Cm set
.Ar property Ns = Ns Ar value
.Ar pool
@@ -187,7 +194,7 @@
.Nm
.Cm status
.Oo Fl c Ar SCRIPT Oc
.Op Fl DigLpPsvx
.Op Fl DigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -806,6 +813,28 @@ Any write requests that have yet to be committed to disk would be blocked.
.It Sy panic
Prints out a message to the console and generates a system crash dump.
.El
.It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off
When set to
.Sy on
space which has been recently freed, and is no longer allocated by the pool,
will be periodically trimmed. This allows block device vdevs which support
BLKDISCARD, such as SSDs, or file vdevs on which the underlying file system
supports hole-punching, to reclaim unused blocks. The default setting for
this property is
.Sy off .
.Pp
Automatic TRIM does not immediately reclaim blocks after a free. Instead,
it will optimistically delay allowing smaller ranges to be aggregated in to
a few larger ones. These can then be issued more efficiently to the storage.
.Pp
Be aware that automatic trimming of recently freed data blocks can put
significant stress on the underlying storage devices. This will vary
depending of how well the specific device handles these commands. For
lower end devices it is often possible to achieve most of the benefits
of automatic trimming by running an on-demand (manual) TRIM periodically
using the
.Nm zpool Cm trim
command.
.It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled
The value of this property is the current state of
.Ar feature_name .
@@ -1782,15 +1811,10 @@ the path. This can be used in conjunction with the
.Fl L
flag.
.It Fl r
Print request size histograms for the leaf ZIOs. This includes
histograms of individual ZIOs (
.Ar ind )
and aggregate ZIOs (
.Ar agg ).
These stats can be useful for seeing how well the ZFS IO aggregator is
working. Do not confuse these request size stats with the block layer
requests; it's possible ZIOs can be broken up before being sent to the
block device.
Print request size histograms for the leaf vdev's IO. This includes
histograms of individual IOs (ind) and aggregate IOs (agg). These stats
can be useful for observing how well IO aggregation is working. Note
that TRIM IOs may exceed 16M, but will be counted as 16M.
.It Fl v
Verbose statistics Reports usage statistics for individual vdevs within the
pool, in addition to the pool-wide statistics.
@@ -1829,6 +1853,8 @@ Average amount of time IO spent in asynchronous priority queues.
Does not include disk time.
.Ar scrub :
Average queuing time in scrub queue. Does not include disk time.
.Ar trim :
Average queuing time in trim queue. Does not include disk time.
.It Fl q
Include active queue statistics. Each priority queue has both
pending (
@@ -1846,6 +1872,8 @@ queues.
Current number of entries in asynchronous priority queues.
.Ar scrubq_read :
Current number of entries in scrub queue.
.Ar trimq_write :
Current number of entries in trim queue.
.Pp
All queue statistics are instantaneous measurements of the number of
entries in the queues. If you specify an interval, the measurements
@@ -2151,6 +2179,48 @@ restarted from the beginning. Any drives that were scheduled for a deferred
resilver will be added to the new one.
.It Xo
.Nm
.Cm trim
.Op Fl d
.Op Fl c | Fl s
.Ar pool
.Op Ar device Ns ...
.Xc
Initiates an immediate on-demand TRIM operation for all of the free space in
a pool. This operation informs the underlying storage devices of all blocks
in the pool which are no longer allocated and allows thinly provisioned
devices to reclaim the space.
.Pp
A manual on-demand TRIM operation can be initiated irrespective of the
.Sy autotrim
pool property setting. See the documentation for the
.Sy autotrim
property above for the types of vdev devices which can be trimmed.
.Bl -tag -width Ds
.It Fl d -secure
Causes a secure TRIM to be initiated. When performing a secure TRIM, the
device guarantees that data stored on the trimmed blocks has been erased.
This requires support from the device and is not supported by all SSDs.
.It Fl r -rate Ar rate
Controls the rate at which the TRIM operation progresses. Without this
option TRIM is executed as quickly as possible. The rate, expressed in bytes
per second, is applied on a per-vdev basis and may be set differently for
each leaf vdev.
.It Fl c, -cancel
Cancel trimming on the specified devices, or all eligible devices if none
are specified.
If one or more target devices are invalid or are not currently being
trimmed, the command will fail and no cancellation will occur on any device.
.It Fl s -suspend
Suspend trimming on the specified devices, or all eligible devices if none
are specified.
If one or more target devices are invalid or are not currently being
trimmed, the command will fail and no suspension will occur on any device.
Trimming can then be resumed by running
.Nm zpool Cm trim
with no flags on the relevant target devices.
.El
.It Xo
.Nm
.Cm set
.Ar property Ns = Ns Ar value
.Ar pool
@@ -2238,7 +2308,7 @@ and automatically import it.
.Nm
.Cm status
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
.Op Fl DigLpPsvx
.Op Fl DigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -2295,6 +2365,8 @@ didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
This does not necessarily mean the IOs failed to complete, just took an
unreasonably long amount of time. This may indicate a problem with the
underlying storage.
.It Fl t
Display vdev TRIM status.
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
Specify
@@ -130,6 +130,9 @@ zpool_prop_init(void)
zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
"wait | continue | panic", "FAILMODE", failuremode_table);
zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL,
"on | off", "AUTOTRIM", boolean_table);

/* hidden properties */
zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
@@ -99,6 +99,7 @@ $(MODULE)-objs += vdev_raidz_math.o
$(MODULE)-objs += vdev_raidz_math_scalar.o
$(MODULE)-objs += vdev_removal.o
$(MODULE)-objs += vdev_root.o
$(MODULE)-objs += vdev_trim.o
$(MODULE)-objs += zap.o
$(MODULE)-objs += zap_leaf.o
$(MODULE)-objs += zap_micro.o
@@ -842,7 +842,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
if (dirty_frees_threshold != 0 &&
long_free_dirty_all_txgs >= dirty_frees_threshold) {
DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
txg_wait_open(dp, 0);
txg_wait_open(dp, 0, B_TRUE);
continue;
}

@@ -181,7 +181,6 @@ int metaslab_lba_weighting_enabled = B_TRUE;
*/
int metaslab_bias_enabled = B_TRUE;


/*
* Enable/disable remapping of indirect DVAs to their concrete vdevs.
*/
@@ -219,6 +218,12 @@ boolean_t metaslab_trace_enabled = B_TRUE;
uint64_t metaslab_trace_max_entries = 5000;
#endif

/*
* Maximum number of metaslabs per group that can be disabled
* simultaneously.
*/
int max_disabled_ms = 3;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -652,8 +657,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)

mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -700,8 +705,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
kmem_free(mg->mg_secondaries, mg->mg_allocators *
sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
mutex_destroy(&mg->mg_ms_initialize_lock);
cv_destroy(&mg->mg_ms_initialize_cv);
mutex_destroy(&mg->mg_ms_disabled_lock);
cv_destroy(&mg->mg_ms_disabled_cv);

for (int i = 0; i < mg->mg_allocators; i++) {
zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1846,8 +1851,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
*/
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
&ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
metaslab_group_add(mg, ms);

ms->ms_trim = range_tree_create(NULL, NULL);

metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms);

/*
@@ -1921,6 +1928,9 @@ metaslab_fini(metaslab_t *msp)
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));

range_tree_vacate(msp->ms_trim, NULL, NULL);
range_tree_destroy(msp->ms_trim);

mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@@ -2727,6 +2737,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT3P(msp->ms_freeing, !=, NULL);
ASSERT3P(msp->ms_freed, !=, NULL);
ASSERT3P(msp->ms_checkpointing, !=, NULL);
ASSERT3P(msp->ms_trim, !=, NULL);

/*
* Normally, we don't want to process a metaslab if there are no
@@ -2999,6 +3010,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
*/
metaslab_load_wait(msp);

/*
* When auto-trimming is enabled, free ranges which are added to
* ms_allocatable are also be added to ms_trim. The ms_trim tree is
* periodically consumed by the vdev_autotrim_thread() which issues
* trims for all ranges and then vacates the tree. The ms_trim tree
* can be discarded at any time with the sole consequence of recent
* frees not being trimmed.
*/
if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
if (!defer_allowed) {
range_tree_walk(msp->ms_freed, range_tree_add,
msp->ms_trim);
}
} else {
range_tree_vacate(msp->ms_trim, NULL, NULL);
}

/*
* Move the frees from the defer_tree back to the free
* range tree (if it's loaded). Swap the freed_tree and
@@ -3047,7 +3076,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* from it in 'metaslab_unload_delay' txgs, then unload it.
*/
if (msp->ms_loaded &&
msp->ms_initializing == 0 &&
msp->ms_disabled == 0 &&
msp->ms_selected_txg + metaslab_unload_delay < txg) {

for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
@@ -3330,7 +3359,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
metaslab_class_t *mc = msp->ms_group->mg_class;

VERIFY(!msp->ms_condensing);
VERIFY0(msp->ms_initializing);
VERIFY0(msp->ms_disabled);

start = mc->mc_ops->msop_alloc(msp, size);
if (start != -1ULL) {
@@ -3341,6 +3370,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
range_tree_remove(rt, start, size);
range_tree_clear(msp->ms_trim, start, size);

if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
@@ -3391,10 +3421,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
}

/*
* If the selected metaslab is condensing or being
* initialized, skip it.
* If the selected metaslab is condensing or disabled,
* skip it.
*/
if (msp->ms_condensing || msp->ms_initializing > 0)
if (msp->ms_condensing || msp->ms_disabled > 0)
continue;

*was_active = msp->ms_allocator != -1;
@@ -3566,9 +3596,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
continue;
} else if (msp->ms_initializing > 0) {
} else if (msp->ms_disabled > 0) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_INITIALIZING, allocator);
TRACE_DISABLED, allocator);
metaslab_passivate(msp, msp->ms_weight &
~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
@@ -4294,6 +4324,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
msp->ms_size);
range_tree_remove(msp->ms_allocatable, offset, size);
range_tree_clear(msp->ms_trim, offset, size);

if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
@@ -4606,6 +4637,7 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
offset, size);
}

range_tree_verify_not_present(msp->ms_trim, offset, size);
range_tree_verify_not_present(msp->ms_freeing, offset, size);
range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
range_tree_verify_not_present(msp->ms_freed, offset, size);
@@ -4637,6 +4669,89 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
spa_config_exit(spa, SCL_VDEV, FTAG);
}

static void
metaslab_group_disable_wait(metaslab_group_t *mg)
{
ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
while (mg->mg_disabled_updating) {
cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
}
}

static void
metaslab_group_disabled_increment(metaslab_group_t *mg)
{
ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
ASSERT(mg->mg_disabled_updating);

while (mg->mg_ms_disabled >= max_disabled_ms) {
cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
}
mg->mg_ms_disabled++;
ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
}

/*
* Mark the metaslab as disabled to prevent any allocations on this metaslab.
* We must also track how many metaslabs are currently disabled within a
* metaslab group and limit them to prevent allocation failures from
* occurring because all metaslabs are disabled.
*/
void
metaslab_disable(metaslab_t *msp)
{
ASSERT(!MUTEX_HELD(&msp->ms_lock));
metaslab_group_t *mg = msp->ms_group;

mutex_enter(&mg->mg_ms_disabled_lock);

/*
* To keep an accurate count of how many threads have disabled
* a specific metaslab group, we only allow one thread to mark
* the metaslab group at a time. This ensures that the value of
* ms_disabled will be accurate when we decide to mark a metaslab
* group as disabled. To do this we force all other threads
* to wait till the metaslab's mg_disabled_updating flag is no
* longer set.
*/
metaslab_group_disable_wait(mg);
mg->mg_disabled_updating = B_TRUE;
if (msp->ms_disabled == 0) {
metaslab_group_disabled_increment(mg);
}
mutex_enter(&msp->ms_lock);
msp->ms_disabled++;
mutex_exit(&msp->ms_lock);

mg->mg_disabled_updating = B_FALSE;
cv_broadcast(&mg->mg_ms_disabled_cv);
mutex_exit(&mg->mg_ms_disabled_lock);
}

void
metaslab_enable(metaslab_t *msp, boolean_t sync)
{
metaslab_group_t *mg = msp->ms_group;
spa_t *spa = mg->mg_vd->vdev_spa;

/*
* Wait for the outstanding IO to be synced to prevent newly
* allocated blocks from being overwritten. This used by
* initialize and TRIM which are modifying unallocated space.
*/
if (sync)
txg_wait_synced(spa_get_dsl(spa), 0);

mutex_enter(&mg->mg_ms_disabled_lock);
mutex_enter(&msp->ms_lock);
if (--msp->ms_disabled == 0) {
mg->mg_ms_disabled--;
cv_broadcast(&mg->mg_ms_disabled_cv);
}
mutex_exit(&msp->ms_lock);
mutex_exit(&mg->mg_ms_disabled_lock);
}

#if defined(_KERNEL)
/* BEGIN CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);
@@ -57,6 +57,7 @@
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -132,7 +133,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
* macros. Other operations process a large amount of data; the ZTI_BATCH
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a a
* are so high frequency and short-lived that the taskq itself can become a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
@@ -150,6 +151,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
{ ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
{ ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
@@ -554,6 +556,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS:
case ZPOOL_PROP_AUTOEXPAND:
case ZPOOL_PROP_AUTOTRIM:
error = nvpair_value_uint64(elem, &intval);
if (!error && intval > 1)
error = SET_ERROR(EINVAL);
@@ -1442,8 +1445,10 @@ spa_unload(spa_t *spa)
spa_async_suspend(spa);

if (spa->spa_root_vdev) {
vdev_initialize_stop_all(spa->spa_root_vdev,
VDEV_INITIALIZE_ACTIVE);
vdev_t *root_vdev = spa->spa_root_vdev;
vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
}

/*
@@ -3585,7 +3590,7 @@ spa_ld_get_props(spa_t *spa)
spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
&spa->spa_dedup_ditto);

spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
spa->spa_autoreplace = (autoreplace != 0);
}

@@ -4336,6 +4341,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)

spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_initialize_restart(spa->spa_root_vdev);
vdev_trim_restart(spa->spa_root_vdev);
vdev_autotrim_restart(spa);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}

@@ -5338,6 +5345,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);

if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
@@ -5746,14 +5754,16 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,

/*
* We're about to export or destroy this pool. Make sure
* we stop all initializtion activity here before we
* set the spa_final_txg. This will ensure that all
* we stop all initialization and trim activity here before
* we set the spa_final_txg. This will ensure that all
* dirty data resulting from the initialization is
* committed to disk before we unload the pool.
*/
if (spa->spa_root_vdev != NULL) {
vdev_initialize_stop_all(spa->spa_root_vdev,
VDEV_INITIALIZE_ACTIVE);
vdev_t *rvd = spa->spa_root_vdev;
vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
}

/*
@@ -6376,7 +6386,6 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_remove_parent(cvd);
}


/*
* We don't set tvd until now because the parent we just removed
* may have been the previous top-level vdev.
@@ -6490,7 +6499,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
* a previous initialization process which has completed but
* the thread is not exited.
*/
if (cmd_type == POOL_INITIALIZE_DO &&
if (cmd_type == POOL_INITIALIZE_START &&
(vd->vdev_initialize_thread != NULL ||
vd->vdev_top->vdev_removing)) {
mutex_exit(&vd->vdev_initialize_lock);
@@ -6507,7 +6516,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
}

switch (cmd_type) {
case POOL_INITIALIZE_DO:
case POOL_INITIALIZE_START:
vdev_initialize(vd);
break;
case POOL_INITIALIZE_CANCEL:
@@ -6571,6 +6580,126 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
return (total_errors);
}

static int
spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));

spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);

/* Look up vdev and ensure it's a leaf. */
vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (vd == NULL || vd->vdev_detached) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
return (SET_ERROR(ENODEV));
} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
return (SET_ERROR(EINVAL));
} else if (!vdev_writeable(vd)) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
return (SET_ERROR(EROFS));
} else if (!vd->vdev_has_trim) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
return (SET_ERROR(EOPNOTSUPP));
} else if (secure && !vd->vdev_has_securetrim) {
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
return (SET_ERROR(EOPNOTSUPP));
}
mutex_enter(&vd->vdev_trim_lock);
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);

/*
* When we activate a TRIM action we check to see if the
* vdev_trim_thread is NULL. We do this instead of using the
* vdev_trim_state since there might be a previous TRIM process
* which has completed but the thread is not exited.
*/
if (cmd_type == POOL_TRIM_START &&
(vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
mutex_exit(&vd->vdev_trim_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_TRIM_CANCEL &&
(vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
mutex_exit(&vd->vdev_trim_lock);
return (SET_ERROR(ESRCH));
} else if (cmd_type == POOL_TRIM_SUSPEND &&
vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
mutex_exit(&vd->vdev_trim_lock);
return (SET_ERROR(ESRCH));
}

switch (cmd_type) {
case POOL_TRIM_START:
vdev_trim(vd, rate, partial, secure);
break;
case POOL_TRIM_CANCEL:
vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
break;
case POOL_TRIM_SUSPEND:
vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
break;
default:
panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
}
mutex_exit(&vd->vdev_trim_lock);

return (0);
}

/*
* Initiates a manual TRIM for the requested vdevs. This kicks off individual
* TRIM threads for each child vdev. These threads pass over all of the free
* space in the vdev's metaslabs and issues TRIM commands for that space.
*/
int
spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
{
int total_errors = 0;
list_t vd_list;

list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_trim_node));

/*
* We hold the namespace lock through the whole function
* to prevent any changes to the pool while we're starting or
* stopping TRIM. The config and state locks are held so that
* we can properly assess the vdev state before we commit to
* the TRIM operation.
*/
mutex_enter(&spa_namespace_lock);

for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
uint64_t vdev_guid = fnvpair_value_uint64(pair);

int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
rate, partial, secure, &vd_list);
if (error != 0) {
char guid_as_str[MAXNAMELEN];

(void) snprintf(guid_as_str, sizeof (guid_as_str),
"%llu", (unsigned long long)vdev_guid);
fnvlist_add_int64(vdev_errlist, guid_as_str, error);
total_errors++;
}
}

/* Wait for all TRIM threads to stop. */
vdev_trim_stop_wait(spa, &vd_list);

/* Sync out the TRIM state */
txg_wait_synced(spa->spa_dsl_pool, 0);
mutex_exit(&spa_namespace_lock);

list_destroy(&vd_list);

return (total_errors);
}

/*
* Split a set of devices from their mirrors, and create a new pool from them.
*/
@@ -6780,24 +6909,36 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
spa_async_suspend(newspa);

/*
* Temporarily stop the initializing activity. We set the state to
* ACTIVE so that we know to resume the initializing once the split
* has completed.
* Temporarily stop the initializing and TRIM activity. We set the
* state to ACTIVE so that we know to resume initializing or TRIM
* once the split has completed.
*/
list_t vd_list;
list_create(&vd_list, sizeof (vdev_t),
list_t vd_initialize_list;
list_create(&vd_initialize_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_initialize_node));

list_t vd_trim_list;
list_create(&vd_trim_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_trim_node));

for (c = 0; c < children; c++) {
if (vml[c] != NULL) {
mutex_enter(&vml[c]->vdev_initialize_lock);
vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE,
&vd_list);
vdev_initialize_stop(vml[c],
VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
mutex_exit(&vml[c]->vdev_initialize_lock);

mutex_enter(&vml[c]->vdev_trim_lock);
vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
mutex_exit(&vml[c]->vdev_trim_lock);
}
}
vdev_initialize_stop_wait(spa, &vd_list);
list_destroy(&vd_list);

vdev_initialize_stop_wait(spa, &vd_initialize_list);
vdev_trim_stop_wait(spa, &vd_trim_list);

list_destroy(&vd_initialize_list);
list_destroy(&vd_trim_list);

newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;

@@ -6899,8 +7040,10 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
vml[c]->vdev_offline = B_FALSE;
}

/* restart initializing disks as necessary */
/* restart initializing or trimming disks as necessary */
spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

vdev_reopen(spa->spa_root_vdev);

@@ -7283,6 +7426,22 @@ spa_async_thread(void *arg)
mutex_exit(&spa_namespace_lock);
}

if (tasks & SPA_ASYNC_TRIM_RESTART) {
mutex_enter(&spa_namespace_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_trim_restart(spa->spa_root_vdev);
spa_config_exit(spa, SCL_CONFIG, FTAG);
mutex_exit(&spa_namespace_lock);
}

if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
mutex_enter(&spa_namespace_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_autotrim_restart(spa);
spa_config_exit(spa, SCL_CONFIG, FTAG);
mutex_exit(&spa_namespace_lock);
}

/*
* Let the world know that we're done.
*/
@@ -7782,6 +7941,11 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
case ZPOOL_PROP_FAILUREMODE:
spa->spa_failmode = intval;
break;
case ZPOOL_PROP_AUTOTRIM:
spa->spa_autotrim = intval;
spa_async_request(spa,
SPA_ASYNC_AUTOTRIM_RESTART);
break;
case ZPOOL_PROP_AUTOEXPAND:
spa->spa_autoexpand = intval;
if (tx->tx_txg != TXG_INITIAL)
@@ -39,6 +39,7 @@
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/metaslab.h>
@@ -1128,6 +1129,9 @@ spa_vdev_enter(spa_t *spa)
{
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);

vdev_autotrim_stop_all(spa);

return (spa_vdev_config_enter(spa));
}

@@ -1204,8 +1208,17 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
NULL);
mutex_exit(&vd->vdev_initialize_lock);

mutex_enter(&vd->vdev_trim_lock);
vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
mutex_exit(&vd->vdev_trim_lock);
}

/*
* The vdev may be both a leaf and top-level device.
*/
vdev_autotrim_stop_wait(vd);

spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
vdev_free(vd);
spa_config_exit(spa, SCL_ALL, spa);
@@ -1227,6 +1240,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
vdev_autotrim_restart(spa);

spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
mutex_exit(&spa->spa_vdev_top_lock);
@@ -1923,6 +1938,12 @@ spa_deadman_synctime(spa_t *spa)
return (spa->spa_deadman_synctime);
}

spa_autotrim_t
spa_get_autotrim(spa_t *spa)
{
return (spa->spa_autotrim);
}

uint64_t
spa_deadman_ziotime(spa_t *spa)
{
@@ -887,6 +887,105 @@ spa_health_destroy(spa_t *spa)
mutex_destroy(&shk->lock);
}

static spa_iostats_t spa_iostats_template = {
{ "trim_extents_written", KSTAT_DATA_UINT64 },
{ "trim_bytes_written", KSTAT_DATA_UINT64 },
{ "trim_extents_skipped", KSTAT_DATA_UINT64 },
{ "trim_bytes_skipped", KSTAT_DATA_UINT64 },
{ "trim_extents_failed", KSTAT_DATA_UINT64 },
{ "trim_bytes_failed", KSTAT_DATA_UINT64 },
{ "autotrim_extents_written", KSTAT_DATA_UINT64 },
{ "autotrim_bytes_written", KSTAT_DATA_UINT64 },
{ "autotrim_extents_skipped", KSTAT_DATA_UINT64 },
{ "autotrim_bytes_skipped", KSTAT_DATA_UINT64 },
{ "autotrim_extents_failed", KSTAT_DATA_UINT64 },
{ "autotrim_bytes_failed", KSTAT_DATA_UINT64 },
};

#define SPA_IOSTATS_ADD(stat, val) \
atomic_add_64(&iostats->stat.value.ui64, (val));

void
spa_iostats_trim_add(spa_t *spa, trim_type_t type,
uint64_t extents_written, uint64_t bytes_written,
uint64_t extents_skipped, uint64_t bytes_skipped,
uint64_t extents_failed, uint64_t bytes_failed)
{
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
kstat_t *ksp = shk->kstat;
spa_iostats_t *iostats;

if (ksp == NULL)
return;

iostats = ksp->ks_data;
if (type == TRIM_TYPE_MANUAL) {
SPA_IOSTATS_ADD(trim_extents_written, extents_written);
SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
} else {
SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
}
}

int
spa_iostats_update(kstat_t *ksp, int rw)
{
if (rw == KSTAT_WRITE) {
memcpy(ksp->ks_data, &spa_iostats_template,
sizeof (spa_iostats_t));
}

return (0);
}

static void
spa_iostats_init(spa_t *spa)
{
spa_history_kstat_t *shk = &spa->spa_stats.iostats;

mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

char *name = kmem_asprintf("zfs/%s", spa_name(spa));
kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);

shk->kstat = ksp;
if (ksp) {
int size = sizeof (spa_iostats_t);
ksp->ks_lock = &shk->lock;
ksp->ks_private = spa;
ksp->ks_update = spa_iostats_update;
ksp->ks_data = kmem_alloc(size, KM_SLEEP);
memcpy(ksp->ks_data, &spa_iostats_template, size);
kstat_install(ksp);
}

strfree(name);
}

static void
spa_iostats_destroy(spa_t *spa)
{
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
kstat_t *ksp = shk->kstat;
if (ksp) {
kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
kstat_delete(ksp);
}

mutex_destroy(&shk->lock);
}

void
spa_stats_init(spa_t *spa)
{
@@ -896,11 +995,13 @@ spa_stats_init(spa_t *spa)
spa_io_history_init(spa);
spa_mmp_history_init(spa);
spa_state_init(spa);
spa_iostats_init(spa);
}

void
spa_stats_destroy(spa_t *spa)
{
spa_iostats_destroy(spa);
spa_health_destroy(spa);
spa_tx_assign_destroy(spa);
spa_txg_history_destroy(spa);
@@ -694,8 +694,12 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
mutex_exit(&tx->tx_sync_lock);
}

/*
* Wait for the specified open transaction group. Set should_quiesce
* when the current open txg should be quiesced immediately.
*/
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg)
txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
{
tx_state_t *tx = &dp->dp_tx;

@@ -705,7 +709,7 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg)
ASSERT3U(tx->tx_threads, ==, 2);
if (txg == 0)
txg = tx->tx_open_txg + 1;
if (tx->tx_quiesce_txg_waiting < txg)
if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
tx->tx_quiesce_txg_waiting = txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
@@ -51,6 +51,7 @@
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>

@@ -543,6 +544,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
list_link_init(&vd->vdev_leaf_node);
list_link_init(&vd->vdev_trim_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -551,6 +553,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
@@ -875,7 +883,10 @@ void
vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;

ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);

/*
* Scan queues are normally destroyed at the end of a scan. If the
@@ -906,7 +917,6 @@ vdev_free(vdev_t *vd)

ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
ASSERT(vd->vdev_initialize_thread == NULL);

/*
* Discard allocation state.
@@ -988,6 +998,12 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_initialize_io_lock);
cv_destroy(&vd->vdev_initialize_io_cv);
cv_destroy(&vd->vdev_initialize_cv);
mutex_destroy(&vd->vdev_trim_lock);
mutex_destroy(&vd->vdev_autotrim_lock);
mutex_destroy(&vd->vdev_trim_io_lock);
cv_destroy(&vd->vdev_trim_cv);
cv_destroy(&vd->vdev_autotrim_cv);
cv_destroy(&vd->vdev_trim_io_cv);

zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -3475,6 +3491,16 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
}
mutex_exit(&vd->vdev_initialize_lock);

/* Restart trimming if necessary */
mutex_enter(&vd->vdev_trim_lock);
if (vdev_writeable(vd) &&
vd->vdev_trim_thread == NULL &&
vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
vd->vdev_trim_secure);
}
mutex_exit(&vd->vdev_trim_lock);

if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3745,8 +3771,7 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
int t;
for (t = 0; t < ZIO_TYPES; t++) {
for (int t = 0; t < VS_ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
@@ -3873,7 +3898,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
/*
* Report intializing progress. Since we don't
* Report initializing progress. Since we don't
* have the initializing locks held, this is only
* an estimate (although a fairly accurate one).
*/
@@ -3884,9 +3909,20 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_initialize_state = vd->vdev_initialize_state;
vs->vs_initialize_action_time =
vd->vdev_initialize_action_time;

/*
* Report manual TRIM progress. Since we don't have
* the manual TRIM locks held, this is only an
* estimate (although fairly accurate one).
*/
vs->vs_trim_notsup = !vd->vdev_has_trim;
vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
vs->vs_trim_state = vd->vdev_trim_state;
vs->vs_trim_action_time = vd->vdev_trim_action_time;
}
/*
* Report expandable space on top-level, non-auxillary devices
* Report expandable space on top-level, non-auxiliary devices
* only. The expandable space is reported in terms of metaslab
* sized units since that determines how much space the pool
* can expand.
@@ -4004,9 +4040,18 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
*/
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
zio_type_t vs_type = type;

/*
* TRIM ops and bytes are reported to user space as
* ZIO_TYPE_IOCTL. This is done to preserve the
* vdev_stat_t structure layout for user space.
*/
if (type == ZIO_TYPE_TRIM)
vs_type = ZIO_TYPE_IOCTL;

vs->vs_ops[type]++;
vs->vs_bytes[type] += psize;
vs->vs_ops[vs_type]++;
vs->vs_bytes[vs_type] += psize;

if (flags & ZIO_FLAG_DELEGATED) {
vsx->vsx_agg_histo[zio->io_priority]
@@ -4104,7 +4149,8 @@ vdev_deflated_space(vdev_t *vd, int64_t space)
}

/*
* Update the in-core space usage stats for this vdev and the root vdev.
* Update the in-core space usage stats for this vdev, its metaslab class,
* and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
@@ -4650,12 +4696,56 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
spa->spa_resilver_deferred = B_TRUE;
}

/*
* Translate a logical range to the physical range for the specified vdev_t.
* This function is initially called with a leaf vdev and will walk each
* parent vdev until it reaches a top-level vdev. Once the top-level is
* reached the physical range is initialized and the recursive function
* begins to unwind. As it unwinds it calls the parent's vdev specific
* translation function to do the real conversion.
*/
void
vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
{
/*
* Walk up the vdev tree
*/
if (vd != vd->vdev_top) {
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
} else {
/*
* We've reached the top-level vdev, initialize the
* physical range to the logical range and start to
* unwind.
*/
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
return;
}

vdev_t *pvd = vd->vdev_parent;
ASSERT3P(pvd, !=, NULL);
ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);

/*
* As this recursive function unwinds, translate the logical
* range into its physical components by calling the
* vdev specific translate function.
*/
range_seg_t intermediate = { { { 0, 0 } } };
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);

physical_rs->rs_start = intermediate.rs_start;
physical_rs->rs_end = intermediate.rs_end;
}

#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);

/* BEGIN CSTYLED */
module_param(zfs_vdev_default_ms_count, int, 0644);
MODULE_PARM_DESC(zfs_vdev_default_ms_count,
@@ -30,6 +30,7 @@
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
@@ -223,7 +224,7 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
if (error) {
zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
elevator, v->vdev_path, device, error);
}
}
@@ -322,7 +323,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,

if (IS_ERR(bdev)) {
int error = -PTR_ERR(bdev);
vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
vdev_dbgmsg(v, "open error=%d count=%d", error, count);
vd->vd_bdev = NULL;
v->vdev_tsd = vd;
rw_exit(&vd->vd_lock);
@@ -333,14 +334,22 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
rw_exit(&vd->vd_lock);
}

struct request_queue *q = bdev_get_queue(vd->vd_bdev);

/* Determine the physical block size */
block_size = vdev_bdev_block_size(vd->vd_bdev);

/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
v->vdev_nowritecache = B_FALSE;

/* Set when device reports it supports TRIM. */
v->vdev_has_trim = !!blk_queue_discard(q);

/* Set when device reports it supports secure TRIM. */
v->vdev_has_securetrim = !!blk_queue_discard_secure(q);

/* Inform the ZIO pipeline that we are non-rotational */
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
v->vdev_nonrot = blk_queue_nonrot(q);

/* Physical volume size in bytes for the partition */
*psize = bdev_capacity(vd->vd_bdev);
@@ -728,6 +737,7 @@ vdev_disk_io_start(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
unsigned long trim_flags = 0;
int rw, flags, error;

/*
@@ -813,6 +823,19 @@ vdev_disk_io_start(zio_t *zio)
#endif
break;

case ZIO_TYPE_TRIM:
#if defined(BLKDEV_DISCARD_SECURE)
if (zio->io_trim_flags & ZIO_TRIM_SECURE)
trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
trim_flags);

rw_exit(&vd->vd_lock);
zio_interrupt(zio);
return;

default:
rw_exit(&vd->vd_lock);
zio->io_error = SET_ERROR(ENOTSUP);
@@ -28,10 +28,13 @@
#include <sys/spa_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/zio.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/abd.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>

/*
* Virtual device vector for files.
@@ -60,9 +63,24 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
vattr_t vattr;
int error;

/* Rotational optimizations only make sense on block devices */
/*
* Rotational optimizations only make sense on block devices.
*/
vd->vdev_nonrot = B_TRUE;

/*
* Allow TRIM on file based vdevs. This may not always be supported,
* since it depends on your kernel version and underlying filesystem
* type but it is always safe to attempt.
*/
vd->vdev_has_trim = B_TRUE;

/*
* Disable secure TRIM on file based vdevs. There is no way to
* request this behavior from the underlying filesystem.
*/
vd->vdev_has_securetrim = B_FALSE;

/*
* We must have a pathname, and it must be absolute.
*/
@@ -227,6 +245,21 @@ vdev_file_io_start(zio_t *zio)
zio->io_error = SET_ERROR(ENOTSUP);
}

zio_execute(zio);
return;
} else if (zio->io_type == ZIO_TYPE_TRIM) {
struct flock flck;

ASSERT3U(zio->io_size, !=, 0);
bzero(&flck, sizeof (flck));
flck.l_type = F_FREESP;
flck.l_start = zio->io_offset;
flck.l_len = zio->io_size;
flck.l_whence = 0;

zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck,
0, 0, kcred, NULL);

zio_execute(zio);
return;
}
@@ -33,12 +33,6 @@
#include <sys/zap.h>
#include <sys/dmu_tx.h>

/*
* Maximum number of metaslabs per group that can be initialized
* simultaneously.
*/
int max_initialize_ms = 3;

/*
* Value that is written to disk during initialization.
*/
@@ -132,7 +126,7 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
guid, 2, ZFS_SPACE_CHECK_NONE, tx);

switch (new_state) {
case VDEV_INITIALIZE_ACTIVE:
@@ -250,49 +244,6 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
return (0);
}

/*
* Translate a logical range to the physical range for the specified vdev_t.
* This function is initially called with a leaf vdev and will walk each
* parent vdev until it reaches a top-level vdev. Once the top-level is
* reached the physical range is initialized and the recursive function
* begins to unwind. As it unwinds it calls the parent's vdev specific
* translation function to do the real conversion.
*/
void
vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
{
/*
* Walk up the vdev tree
*/
if (vd != vd->vdev_top) {
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
} else {
/*
* We've reached the top-level vdev, initialize the
* physical range to the logical range and start to
* unwind.
*/
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
return;
}

vdev_t *pvd = vd->vdev_parent;
ASSERT3P(pvd, !=, NULL);
ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);

/*
* As this recursive function unwinds, translate the logical
* range into its physical components by calling the
* vdev specific translate function.
*/
range_seg_t intermediate = { { { 0, 0 } } };
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);

physical_rs->rs_start = intermediate.rs_start;
physical_rs->rs_end = intermediate.rs_end;
}

/*
* Callback to fill each ABD chunk with zfs_initialize_value. len must be
* divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
@@ -362,81 +313,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
return (0);
}

static void
vdev_initialize_mg_wait(metaslab_group_t *mg)
{
ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
while (mg->mg_initialize_updating) {
cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
}
}

static void
vdev_initialize_mg_mark(metaslab_group_t *mg)
{
ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
ASSERT(mg->mg_initialize_updating);

while (mg->mg_ms_initializing >= max_initialize_ms) {
cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
}
mg->mg_ms_initializing++;
ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
}

/*
* Mark the metaslab as being initialized to prevent any allocations
* on this metaslab. We must also track how many metaslabs are currently
* being initialized within a metaslab group and limit them to prevent
* allocation failures from occurring because all metaslabs are being
* initialized.
*/
static void
vdev_initialize_ms_mark(metaslab_t *msp)
{
ASSERT(!MUTEX_HELD(&msp->ms_lock));
metaslab_group_t *mg = msp->ms_group;

mutex_enter(&mg->mg_ms_initialize_lock);

/*
* To keep an accurate count of how many threads are initializing
* a specific metaslab group, we only allow one thread to mark
* the metaslab group at a time. This ensures that the value of
* ms_initializing will be accurate when we decide to mark a metaslab
* group as being initialized. To do this we force all other threads
* to wait till the metaslab's mg_initialize_updating flag is no
* longer set.
*/
vdev_initialize_mg_wait(mg);
mg->mg_initialize_updating = B_TRUE;
if (msp->ms_initializing == 0) {
vdev_initialize_mg_mark(mg);
}
mutex_enter(&msp->ms_lock);
msp->ms_initializing++;
mutex_exit(&msp->ms_lock);

mg->mg_initialize_updating = B_FALSE;
cv_broadcast(&mg->mg_ms_initialize_cv);
mutex_exit(&mg->mg_ms_initialize_lock);
}

static void
vdev_initialize_ms_unmark(metaslab_t *msp)
{
ASSERT(!MUTEX_HELD(&msp->ms_lock));
metaslab_group_t *mg = msp->ms_group;
mutex_enter(&mg->mg_ms_initialize_lock);
mutex_enter(&msp->ms_lock);
if (--msp->ms_initializing == 0) {
mg->mg_ms_initializing--;
cv_broadcast(&mg->mg_ms_initialize_cv);
}
mutex_exit(&msp->ms_lock);
mutex_exit(&mg->mg_ms_initialize_lock);
}

static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
@@ -535,9 +411,8 @@ vdev_initialize_load(vdev_t *vd)
return (err);
}


/*
* Convert the logical range into a physcial range and add it to our
* Convert the logical range into a physical range and add it to our
* avl tree.
*/
void
@@ -618,24 +493,17 @@ vdev_initialize_thread(void *arg)
ms_count = vd->vdev_top->vdev_ms_count;
}

vdev_initialize_ms_mark(msp);
spa_config_exit(spa, SCL_CONFIG, FTAG);
metaslab_disable(msp);
mutex_enter(&msp->ms_lock);
VERIFY0(metaslab_load(msp));

range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
vd);
mutex_exit(&msp->ms_lock);

spa_config_exit(spa, SCL_CONFIG, FTAG);
error = vdev_initialize_ranges(vd, deadbeef);

/*
* Wait for the outstanding IO to be synced to prevent
* newly allocated blocks from being overwritten.
*/
txg_wait_synced(spa_get_dsl(spa), 0);

vdev_initialize_ms_unmark(msp);
metaslab_enable(msp, B_TRUE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
@@ -853,12 +721,11 @@ vdev_initialize_restart(vdev_t *vd)
}

#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_initialize_restart);
EXPORT_SYMBOL(vdev_xlate);
EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);
EXPORT_SYMBOL(vdev_initialize_restart);

/* CSTYLED */
module_param(zfs_initialize_value, ulong, 0644);
@@ -251,6 +251,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);

fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]);

/* ZIOs pending */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
@@ -267,6 +270,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);

fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]);

/* Histograms */
fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
vsx->vsx_total_histo[ZIO_TYPE_READ],
@@ -304,6 +310,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));

fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM],
ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM]));

/* Request sizes */
fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ],
@@ -325,6 +335,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB],
ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB]));

fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM],
ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM]));

fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ]));
@@ -345,6 +359,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));

fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));

/* IO delays */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);

@@ -156,6 +156,8 @@ uint32_t zfs_vdev_removal_min_active = 1;
uint32_t zfs_vdev_removal_max_active = 2;
uint32_t zfs_vdev_initializing_min_active = 1;
uint32_t zfs_vdev_initializing_max_active = 1;
uint32_t zfs_vdev_trim_min_active = 1;
uint32_t zfs_vdev_trim_max_active = 2;

/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -203,6 +205,12 @@ int zfs_vdev_queue_depth_pct = 300;
*/
int zfs_vdev_def_queue_depth = 32;

/*
* Allow TRIM I/Os to be aggregated. This should normally not be needed since
* TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
* by the TRIM code in zfs_trim.c.
*/
int zfs_vdev_aggregate_trim = 0;

int
vdev_queue_offset_compare(const void *x1, const void *x2)
@@ -227,11 +235,13 @@ vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
{
ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
if (t == ZIO_TYPE_READ)
return (&vq->vq_read_offset_tree);
else
else if (t == ZIO_TYPE_WRITE)
return (&vq->vq_write_offset_tree);
else
return (&vq->vq_trim_offset_tree);
}

int
@@ -266,6 +276,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_removal_min_active);
case ZIO_PRIORITY_INITIALIZING:
return (zfs_vdev_initializing_min_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -338,6 +350,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_removal_max_active);
case ZIO_PRIORITY_INITIALIZING:
return (zfs_vdev_initializing_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -398,19 +412,25 @@ vdev_queue_init(vdev_t *vd)
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));

for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);

/*
* The synchronous i/o queues are dispatched in FIFO rather
* The synchronous/trim i/o queues are dispatched in FIFO rather
* than LBA order. This provides more consistent latency for
* these i/os.
*/
if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
if (p == ZIO_PRIORITY_SYNC_READ ||
p == ZIO_PRIORITY_SYNC_WRITE ||
p == ZIO_PRIORITY_TRIM) {
compfn = vdev_queue_timestamp_compare;
else
} else {
compfn = vdev_queue_offset_compare;
}
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
@@ -428,6 +448,7 @@ vdev_queue_fini(vdev_t *vd)
avl_destroy(&vq->vq_active_tree);
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));

mutex_destroy(&vq->vq_lock);
}
@@ -559,6 +580,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
return (NULL);

/*
* While TRIM commands could be aggregated based on offset this
* behavior is disabled until it's determined to be beneficial.
*/
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
return (NULL);

first = last = zio;

if (zio->io_type == ZIO_TYPE_READ)
@@ -732,7 +760,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
* For LBA-ordered queues (async / scrub / initializing), issue the
* i/o which follows the most recently issued i/o in LBA (offset) order.
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
* For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
*/
tree = vdev_queue_class_tree(vq, p);
vq->vq_io_search.io_timestamp = 0;
@@ -783,19 +811,27 @@ vdev_queue_io(zio_t *zio)
* not match the child's i/o type. Fix it up here.
*/
if (zio->io_type == ZIO_TYPE_READ) {
ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);

if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);

if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
}
} else {
ASSERT(zio->io_type == ZIO_TYPE_TRIM);
ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
}

zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
@@ -922,6 +958,9 @@ module_param(zfs_vdev_aggregation_limit_non_rotating, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregation_limit_non_rotating,
"Max vdev I/O aggregation size for non-rotating media");

module_param(zfs_vdev_aggregate_trim, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregate_trim, "Allow TRIM I/O to be aggregated");

module_param(zfs_vdev_read_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");

@@ -995,6 +1034,14 @@ module_param(zfs_vdev_sync_write_min_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
"Min active sync write I/Os per vdev");

module_param(zfs_vdev_trim_max_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_trim_max_active,
"Max active trim/discard I/Os per vdev");

module_param(zfs_vdev_trim_min_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_trim_min_active,
"Min active trim/discard I/Os per vdev");

module_param(zfs_vdev_queue_depth_pct, int, 0644);
MODULE_PARM_DESC(zfs_vdev_queue_depth_pct,
"Queue depth percentage for each top-level vdev");
@@ -37,7 +37,7 @@
#include <sys/vdev_raidz_impl.h>

#ifdef ZFS_DEBUG
#include <sys/vdev_initialize.h> /* vdev_xlate testing */
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
@@ -45,6 +45,7 @@
#include <sys/vdev_indirect_mapping.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/trace_vdev.h>

/*
@@ -1181,6 +1182,8 @@ vdev_remove_complete(spa_t *spa)
txg = spa_vdev_enter(spa);
vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);

sysevent_t *ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1869,8 +1872,10 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)

spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);

/* Stop initializing */
/* Stop initializing and TRIM */
vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
vdev_autotrim_stop_wait(vd);

*txg = spa_vdev_config_enter(spa);

@@ -2051,11 +2056,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
error = spa_reset_logs(spa);

/*
* We stop any initializing that is currently in progress but leave
* the state as "active". This will allow the initializing to resume
* if the removal is canceled sometime later.
* We stop any initializing and TRIM that is currently in progress
* but leave the state as "active". This will allow the process to
* resume if the removal is canceled sometime later.
*/
vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_wait(vd);

*txg = spa_vdev_config_enter(spa);

@@ -2069,6 +2076,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
if (error != 0) {
metaslab_group_activate(mg);
spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
return (error);
}