From 1eb5bfa3dcdaecb19543d9df13131374a7a42947 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 21 Dec 2012 14:57:09 -0800 Subject: [PATCH] Illumos #3145, #3212 3145 single-copy arc 3212 ztest: race condition between vdev_online() and spa_vdev_remove() Reviewed by: Matt Ahrens Reviewed by: Adam Leventhal Reviewed by: Eric Schrock Reviewed by: Justin T. Gibbs Approved by: Eric Schrock References: illumos-gate/commit/9253d63df408bb48584e0b1abfcc24ef2472382e illumos changeset: 13840:97fd5cdf328a https://www.illumos.org/issues/3145 https://www.illumos.org/issues/3212 Ported-by: Brian Behlendorf Closes #989 Closes #1137 --- cmd/ztest/ztest.c | 11 +++++++ include/sys/arc.h | 1 + module/zfs/arc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++- module/zfs/dbuf.c | 19 ++++++++++- 4 files changed, 113 insertions(+), 2 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index c9fdf466f..07b81cc27 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -4965,7 +4965,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) if (islog) (void) rw_exit(&ztest_name_lock); } else { + /* + * Ideally we would like to be able to randomly + * call vdev_[on|off]line without holding locks + * to force unpredictable failures but the side + * effects of vdev_[on|off]line prevent us from + * doing so. We grab the ztest_vdev_lock here to + * prevent a race between injection testing and + * aux_vdev removal. + */ + mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); + mutex_exit(&ztest_vdev_lock); } } diff --git a/include/sys/arc.h b/include/sys/arc.h index 443597df0..dbc91ea3d 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -109,6 +109,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); +boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5d9b34fbf..5e21b1b71 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -189,6 +189,7 @@ unsigned long zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_disable_dup_eviction = 0; int zfs_arc_meta_prune = 0; /* @@ -307,6 +308,9 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_no_grow; @@ -387,6 +391,9 @@ static arc_stats_t arc_stats = { { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, @@ -1369,6 +1376,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1467,6 +1485,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1651,6 +1679,48 @@ arc_buf_size(arc_buf_t *buf) return (buf->b_hdr->b_size); } +/* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + /* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. @@ -2753,8 +2823,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -3324,6 +3396,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 99f728f4b..205abaada 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2189,7 +2189,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) + + /* + * A dbuf will be eligible for eviction if either the + * 'primarycache' property is set or a duplicate + * copy of this buffer is already cached in the arc. + * + * In the case of the 'primarycache' a buffer + * is considered for eviction if it matches the + * criteria set in the property. + * + * To decide if our buffer is considered a + * duplicate, we must call into the arc to determine + * if multiple buffers are referencing the same + * block on-disk. If so, then we simply evict + * ourselves. + */ + if (!DBUF_IS_CACHEABLE(db) || + arc_buf_eviction_needed(db->db_buf)) dbuf_clear(db); else mutex_exit(&db->db_mtx);