Multipath autoreplace, control enclosure LEDs, event rate limiting
1. Enable multipath autoreplace support for FMA.

This extends FMA autoreplace to work with multipath disks.  This
requires libdevmapper to be installed at build time.

2. Turn on/off fault LEDs when VDEVs become degraded/faulted/online

Set ZED_USE_ENCLOSURE_LEDS=1 in zed.rc to have ZED turn the enclosure LED
for a drive on or off when the drive becomes FAULTED/DEGRADED or returns to
ONLINE.  Your enclosure must be supported by the Linux SES driver for this
to work.  The enclosure LED scripts also work for multipath devices, and
they clear the LED when the fault is cleared.

3. Rate limit ZIO delay and checksum events so as not to flood ZED

ZIO delay and checksum events are rate limited to 5/sec in the zfs module;
a sketch of the idea follows below.
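For illustration, the limiter boils down to a counter that resets once per
interval.  The following is a hedged user-space sketch of that idea; the
type and function names (event_ratelimit_t, ratelimit_ok) are invented
here, and the in-module implementation differs:

#include <time.h>

typedef struct event_ratelimit {
	time_t		start;		/* start of the current interval */
	unsigned int	count;		/* events seen in this interval */
	unsigned int	burst;		/* events allowed per interval */
	unsigned int	interval;	/* interval length, in seconds */
} event_ratelimit_t;

/* Return 1 if the event may be posted, 0 if it should be dropped. */
static int
ratelimit_ok(event_ratelimit_t *rl)
{
	time_t now = time(NULL);

	if (now - rl->start >= rl->interval) {
		rl->start = now;	/* new interval: reset the budget */
		rl->count = 0;
	}
	if (rl->count >= rl->burst)
		return (0);		/* over budget: drop this event */
	rl->count++;
	return (1);
}

With burst = 5 and interval = 1, at most five delay or checksum events per
second reach ZED; the rest are dropped inside the module.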

Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Don Brady <don.brady@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #2449 
Closes #3017 
Closes #5159
tonyhutter authored and behlendorf committed Oct 19, 2016
1 parent 7c502b0 commit 6078881
Showing 24 changed files with 668 additions and 61 deletions.
8 changes: 6 additions & 2 deletions cmd/zed/Makefile.am
@@ -60,7 +60,9 @@ dist_zedexec_SCRIPTS = \
zed.d/io-notify.sh \
zed.d/io-spare.sh \
zed.d/resilver_finish-notify.sh \
zed.d/scrub_finish-notify.sh
zed.d/scrub_finish-notify.sh \
zed.d/statechange-led.sh \
zed.d/vdev_clear-led.sh

zedconfdefaults = \
all-syslog.sh \
@@ -70,7 +72,9 @@ zedconfdefaults = \
io-notify.sh \
io-spare.sh \
resilver_finish-notify.sh \
scrub_finish-notify.sh
scrub_finish-notify.sh \
statechange-led.sh \
vdev_clear-led.sh

install-data-hook:
$(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
90 changes: 64 additions & 26 deletions cmd/zed/agents/zfs_mod.c
@@ -189,10 +189,22 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
char rawpath[PATH_MAX], fullpath[PATH_MAX];
char devpath[PATH_MAX];
int ret;
int is_dm = 0;
uint_t c;
vdev_stat_t *vs;

if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
return;

/* Skip healthy disks */
verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
if (vs->vs_state == VDEV_STATE_HEALTHY) {
zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
__func__, path);
return;
}

(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
@@ -201,8 +213,13 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
if (offline)
return; /* don't intervene if it was taken offline */

zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s' (%llu)",
zpool_get_name(zhp), path, (long long unsigned int)guid);
#ifdef HAVE_LIBDEVMAPPER
is_dm = dev_is_dm(path);
#endif
zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
" wholedisk %d, dm %d (%llu)", zpool_get_name(zhp), path,
physpath ? physpath : "NULL", wholedisk, is_dm,
(long long unsigned int)guid);

/*
* The VDEV guid is preferred for identification (gets passed in path)
@@ -216,7 +233,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
*/
(void) strlcpy(fullpath, path, sizeof (fullpath));
if (wholedisk) {
char *spath = zfs_strip_partition(g_zfshdl, fullpath);
char *spath = zfs_strip_partition(fullpath);
if (!spath) {
zed_log_msg(LOG_INFO, "%s: Can't alloc",
__func__);
return;
}

(void) strlcpy(fullpath, spath, sizeof (fullpath));
free(spath);
@@ -241,8 +263,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* a true online (without the unspare flag), which will trigger a FMA
* fault.
*/
if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
!wholedisk || physpath == NULL) {
if (!is_dm && (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
!wholedisk || physpath == NULL)) {
(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
&newstate);
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
@@ -255,7 +277,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
*/
(void) snprintf(rawpath, sizeof (rawpath), "%s%s", DEV_BYPATH_PATH,
physpath);
if (realpath(rawpath, devpath) == NULL) {
if (realpath(rawpath, devpath) == NULL && !is_dm) {
zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
rawpath, strerror(errno));

@@ -267,10 +289,27 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
return;
}

/*
* we're auto-replacing a raw disk, so label it first
*/
if (!labeled) {
if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL)) {
zed_log_msg(LOG_INFO, "%s: Autoreplace is not enabled on this"
" pool, ignore disk.", __func__);
return;
}

/* Only autoreplace bad disks */
if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
(vs->vs_state != VDEV_STATE_FAULTED) &&
(vs->vs_state != VDEV_STATE_CANT_OPEN)) {
return;
}

nvlist_lookup_string(vdev, "new_devid", &new_devid);

if (is_dm) {
/* Don't label device mapper or multipath disks. */
} else if (!labeled) {
/*
* we're auto-replacing a raw disk, so label it first
*/
char *leafname;

/*
@@ -311,7 +350,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
list_insert_tail(&g_device_list, device);

zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
leafname, (long long unsigned int)guid);
leafname, (u_longlong_t) guid);

return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */

@@ -337,16 +376,10 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
}

zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
physpath, (long long unsigned int)guid);

if (nvlist_lookup_string(vdev, "new_devid", &new_devid) != 0) {
zed_log_msg(LOG_INFO, " auto replace: missing devid!");
return;
}
physpath, (u_longlong_t) guid);

(void) snprintf(devpath, sizeof (devpath), "%s%s",
DEV_BYID_PATH, new_devid);
path = devpath;
}

/*
@@ -411,7 +444,7 @@ static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
dev_data_t *dp = data;
char *path;
char *path = NULL;
uint_t c, children;
nvlist_t **child;

@@ -450,15 +483,15 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
* the dp->dd_compare value.
*/
if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
strcmp(dp->dd_compare, path) != 0) {
strcmp(dp->dd_compare, path) != 0)
return;
}

zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
dp->dd_prop, path);
dp->dd_found = B_TRUE;

/* pass the new devid for use by replacing code */
if (dp->dd_islabeled && dp->dd_new_devid != NULL) {
if (dp->dd_new_devid != NULL) {
(void) nvlist_add_string(nvl, "new_devid",
dp->dd_new_devid);
}
@@ -608,11 +641,11 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)

(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);

zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s)", devid,
devpath ? devpath : "NULL");

is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);

zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
devid, devpath ? devpath : "NULL", is_slice);

/*
* Iterate over all vdevs looking for a match in the following order:
* 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
@@ -681,7 +714,12 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)

(void) strlcpy(fullpath, path, sizeof (fullpath));
if (wholedisk) {
char *spath = zfs_strip_partition(g_zfshdl, fullpath);
char *spath = zfs_strip_partition(fullpath);
if (!spath) {
zed_log_msg(LOG_INFO, "%s: Can't alloc",
__func__);
return (0);
}

(void) strlcpy(fullpath, spath, sizeof (fullpath));
free(spath);
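The dev_is_dm() call in the first hunk above is only compiled when
libdevmapper is present at build time.  As a rough sketch of what such a
test amounts to (hypothetical name and simplified heuristic; the real
helper is built on libdevmapper), a device-mapper node can be recognized by
resolving the path and checking the node name:

#include <limits.h>
#include <stdlib.h>
#include <string.h>

/*
 * Sketch only: /dev/mapper/* entries are symlinks to /dev/dm-N, so
 * resolving the path and testing the prefix identifies a DM device.
 */
static int
dev_is_dm_sketch(const char *path)
{
	char rp[PATH_MAX];

	if (realpath(path, rp) == NULL)
		return (0);
	return (strncmp(rp, "/dev/dm-", 8) == 0);
}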
88 changes: 88 additions & 0 deletions cmd/zed/zed.d/statechange-led.sh
@@ -0,0 +1,88 @@
#!/bin/bash
#
# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes.
#
# Turn LED on if the VDEV becomes faulted/degraded, and turn it back off when
# it's healthy again. This requires that your enclosure be supported by the
# Linux SCSI enclosure services (ses) driver. The script will do nothing
# if you have no enclosure, or if your enclosure isn't supported.
#
# This script also requires ZFS to be built with libdevmapper support.
#
# Exit codes:
# 0: enclosure LED successfully set
# 1: enclosure LEDs not available
# 2: enclosure LEDs administratively disabled
# 3: ZED built without libdevmapper

[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

# ZEVENT_VDEV_UPATH will not be present if ZFS is not built with libdevmapper
[ -n "${ZEVENT_VDEV_UPATH}" ] || exit 3

if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then
exit 2
fi

if [ ! -d /sys/class/enclosure ] ; then
exit 1
fi

# Turn on/off enclosure LEDs
function led
{
name=$1
val=$2

# We want to check the current state first, since writing to the
# 'fault' entry always causes a SES command, even if the
# current state is already what you want.
if [ -e /sys/block/$name/device/enclosure_device*/fault ] ; then
# We have to do some monkey business to deal with spaces in
# enclosure_device names. I've seen horrible things like this:
#
# '/sys/block/sdfw/device/enclosure_device:SLOT 43 41 /fault'
#
# ...so escape all spaces.
file=`ls /sys/block/$name/device/enclosure_device*/fault | sed 's/\s/\\ /g'`

current=`cat "$file"`

# On some enclosures if you write 1 to fault, and read it back,
# it will return 2. Treat all non-zero values as 1 for
# simplicity.
if [ "$current" != "0" ] ; then
current=1
fi

if [ "$current" != "$val" ] ; then
# Set the value twice. I've seen enclosures that were
# flaky about setting it the first time.
echo $val > "$file"
echo $val > "$file"
fi
fi
}

# Decide whether to turn on/off an LED based on the state
# Pass in the path name and fault string ("ONLINE"/"FAULTED"/"DEGRADED", etc.)
function process {
# e.g. path=/dev/sda, fault=FAULTED

path=$1
fault=$2
name=`basename $path`

if [ -z "$name" ] ; then
return
fi

if [ "$fault" == "FAULTED" ] || [ "$fault" == "DEGRADED" ] ; then
led $name 1
else
led $name 0
fi
}

process "$ZEVENT_VDEV_UPATH" "$ZEVENT_VDEV_STATE_STR"
1 change: 1 addition & 0 deletions cmd/zed/zed.d/vdev_clear-led.sh
8 changes: 8 additions & 0 deletions cmd/zed/zed.d/zed.rc
@@ -85,6 +85,14 @@
#
#ZED_SPARE_ON_IO_ERRORS=1

##
# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for
# device mapper and multipath devices as well. Your enclosure must be
# supported by the Linux SES driver for this to work.
#
ZED_USE_ENCLOSURE_LEDS=1


##
# The syslog priority (e.g., specified as a "facility.level" pair).
#
24 changes: 21 additions & 3 deletions cmd/zed/zed_disk_event.c
@@ -159,6 +159,7 @@ static void *
zed_udev_monitor(void *arg)
{
struct udev_monitor *mon = arg;
char *tmp, *tmp2;

zed_log_msg(LOG_INFO, "Waiting for new uduev disk events...");

@@ -284,9 +285,26 @@
if (strcmp(class, EC_DEV_STATUS) == 0 &&
udev_device_get_property_value(dev, "DM_UUID") &&
udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
/* Fake a MP "change" event to look like a "create" */
class = EC_DEV_ADD;
subclass = ESC_DISK;
tmp = (char *) udev_device_get_devnode(dev);
tmp2 = get_underlying_path(NULL, tmp);
if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
/*
* We have a real underlying device, which
* means that this multipath "change" event is
* an "add" event.
*
* If the multipath device and the underlying
* dev have the same name (e.g. /dev/dm-5), then
* there is no real underlying disk for this
* multipath device, and so this "change" event
* is really a multipath removal.
*/
class = EC_DEV_ADD;
subclass = ESC_DISK;
} else {
/* multipath remove, ignore it. */
}
free(tmp2);
}

if ((nvl = dev_event_nvlist(dev)) != NULL) {
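The add-versus-remove classification above fits in a few lines.  This is an
illustrative sketch with a hypothetical stand-in declaration for
get_underlying_path(); the real prototype lives in this commit's libzfs
changes:

#include <stdlib.h>
#include <string.h>

extern char *get_underlying_path(void *hdl, const char *node);

/*
 * A DM "change" uevent counts as a disk add only when the node
 * resolves to a different underlying device; if it resolves to
 * itself, the multipath device has lost its disks (a removal).
 */
static int
dm_change_is_add(const char *node)
{
	char *under = get_underlying_path(NULL, node);
	int is_add = (under != NULL && strcmp(node, under) != 0);

	free(under);
	return (is_add);
}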
28 changes: 28 additions & 0 deletions cmd/zed/zed_event.c
@@ -843,6 +843,23 @@ _zed_internal_event(const char *class, nvlist_t *nvl)
}
}

static void
_zed_event_add_upath(uint64_t eid, zed_strings_t *zsp, nvlist_t *nvl)
{
char *path = NULL;
char *upath = NULL;
if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
&path) == 0) {
upath = get_underlying_path(NULL, path);
if (upath) {
_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX,
"VDEV_UPATH",
"%s", upath);
free(upath);
}
}
}

/*
* Service the next zevent, blocking until one is available.
*/
@@ -912,8 +929,19 @@ zed_event_service(struct zed_conf *zcp)
subclass = _zed_event_get_subclass(class);
_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS",
"%s", (subclass ? subclass : class));

_zed_event_add_time_strings(eid, zsp, etime);

/*
* If a VDEV is included, resolve its path to the "underlying
* device". This is useful for resolving device mapper and
* multipath devices to their underlying /dev/sd* devices.
* For example, if you have a DM or multipath VDEV
* (/dev/mapper/mpatha) that points to one or more /dev/sd*
* devices, this will return the first of its devices.
*/
_zed_event_add_upath(eid, zsp, nvl);

zed_exec_process(eid, class, subclass,
zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd);

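To picture what the VDEV_UPATH resolution does, here is a minimal sketch of
mapping a device-mapper node to its first underlying disk via sysfs.  It is
illustrative only (the function name is hypothetical, and the library's
actual implementation differs):

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Sketch: map "/dev/dm-5" to its first slave, e.g. "/dev/sda", by
 * reading /sys/block/<name>/slaves/.  Caller frees the result.
 */
static char *
underlying_path_sketch(const char *devnode)
{
	const char *name = strrchr(devnode, '/');
	char sysdir[PATH_MAX], *ret = NULL;
	struct dirent *de;
	DIR *dp;

	name = (name != NULL) ? name + 1 : devnode;
	(void) snprintf(sysdir, sizeof (sysdir), "/sys/block/%s/slaves",
	    name);

	if ((dp = opendir(sysdir)) == NULL)
		return (strdup(devnode));	/* no slaves: not a DM node */

	while ((de = readdir(dp)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		if ((ret = malloc(PATH_MAX)) != NULL)
			(void) snprintf(ret, PATH_MAX, "/dev/%s",
			    de->d_name);
		break;
	}
	(void) closedir(dp);
	return (ret != NULL ? ret : strdup(devnode));
}

For a multipath node (after resolving /dev/mapper/mpatha to its dm-N name),
this yields the first of its /dev/sd* member disks, matching the behavior
described in the comment above.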