diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h index d8e73337622c..a1595c76540c 100644 --- a/include/os/linux/zfs/sys/trace_arc.h +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -103,12 +103,12 @@ DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ TP_PROTO(arc_buf_hdr_t *ab), \ TP_ARGS(ab)) DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__iohit); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync); -DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); @@ -387,12 +387,12 @@ DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction); #else DEFINE_DTRACE_PROBE1(arc__hit); +DEFINE_DTRACE_PROBE1(arc__iohit); DEFINE_DTRACE_PROBE1(arc__evict); DEFINE_DTRACE_PROBE1(arc__delete); DEFINE_DTRACE_PROBE1(new_state__mru); DEFINE_DTRACE_PROBE1(new_state__mfu); DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync); -DEFINE_DTRACE_PROBE1(arc__demand__hit__predictive__prefetch); DEFINE_DTRACE_PROBE1(l2arc__hit); DEFINE_DTRACE_PROBE1(l2arc__miss); DEFINE_DTRACE_PROBE2(l2arc__read); diff --git a/include/sys/arc.h b/include/sys/arc.h index 532a2fe4bc03..b2f8f20e16e5 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -115,7 +115,6 @@ typedef enum arc_flags ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 03eebafa9952..f4e3c37c3757 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -101,6 +101,9 @@ struct arc_callback { boolean_t acb_compressed; boolean_t acb_noauth; boolean_t acb_nobuf; + boolean_t acb_wait; + kmutex_t acb_wait_lock; + kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; @@ -174,6 +177,7 @@ typedef struct l1arc_buf_hdr { zfs_refcount_t b_refcnt; arc_callback_t *b_acb; + arc_callback_t **b_acbp; abd_t *b_pabd; } l1arc_buf_hdr_t; @@ -512,14 +516,19 @@ struct arc_buf_hdr { typedef struct arc_stats { kstat_named_t arcstat_hits; + kstat_named_t arcstat_iohits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; @@ -844,8 +853,12 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; + kstat_named_t arcstat_predictive_prefetch; kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_iohit_predictive_prefetch; + kstat_named_t arcstat_prescient_prefetch; kstat_named_t arcstat_demand_hit_prescient_prefetch; + kstat_named_t 
arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; @@ -855,14 +868,19 @@ typedef struct arc_stats { typedef struct arc_sums { wmsum_t arcstat_hits; + wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; + wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; + wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; + wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; + wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; @@ -936,8 +954,12 @@ typedef struct arc_sums { wmsum_t arcstat_prune; aggsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; + wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; + wmsum_t arcstat_demand_iohit_predictive_prefetch; + wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; + wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1f97631f974f..8b03ea9caa40 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -857,7 +857,7 @@ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); -static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); @@ -1138,9 +1138,8 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&hdr->b_l1hdr.b_arc_node); - list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); @@ -2283,31 +2282,20 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { - arc_state_t *state; + arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } - state = hdr->b_l1hdr.b_state; - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { + state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. 
*/ - if (state != arc_l2c_only) { - multilist_remove(&state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); + arc_evictable_space_decrement(hdr, state); } } @@ -2317,13 +2305,13 @@ add_reference(arc_buf_hdr_t *hdr, const void *tag) * list making it eligible for eviction. */ static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, const void *tag) +remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* @@ -2333,7 +2321,6 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, const void *tag) if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); @@ -2394,8 +2381,7 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) * for the buffer must be held by the caller. */ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; @@ -2424,7 +2410,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } update_new = update_old; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); @@ -3487,6 +3473,7 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; + nhdr->b_l1hdr.b_acbp = hdr->b_l1hdr.b_acbp; nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; /* @@ -3531,6 +3518,7 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_acb = NULL; + hdr->b_l1hdr.b_acbp = NULL; hdr->b_l1hdr.b_pabd = NULL; if (ocache == hdr_full_crypt_cache) { @@ -3853,7 +3841,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); + VERIFY0(remove_reference(hdr, tag)); arc_hdr_destroy(hdr); return; } @@ -3867,7 +3855,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); - (void) remove_reference(hdr, hash_lock, tag); + (void) remove_reference(hdr, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } @@ -3903,11 +3891,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* @@ -3934,7 +3922,7 
@@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, hdr, hash_lock); + arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. @@ -3943,7 +3931,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } @@ -3954,10 +3942,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - MSEC_TO_TICK(min_lifetime))) { + MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } @@ -4022,7 +4009,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); - arc_change_state(evicted_state, hdr, hash_lock); + arc_change_state(evicted_state, hdr); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); @@ -5443,150 +5430,129 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) /* * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + /* + * Update buffer prefetch status. + */ + boolean_t was_prefetch = HDR_PREFETCH(hdr); + boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; + if (was_prefetch) { + if (!now_prefetch) { + ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, + HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, + prefetch); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } + } else if (now_prefetch) { + if (HDR_PRESCIENT_PREFETCH(hdr)) + ARCSTAT_BUMP(arcstat_prescient_prefetch); + else + ARCSTAT_BUMP(arcstat_predictive_prefetch); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } + if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + if (arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + + clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. 
          */
-        ASSERT0(hdr->b_l1hdr.b_arc_access);
-        hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+        hdr->b_l1hdr.b_arc_access = now;
         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
-        arc_change_state(arc_mru, hdr, hash_lock);
+        arc_change_state(arc_mru, hdr);
     } else if (hdr->b_l1hdr.b_state == arc_mru) {
-        now = ddi_get_lbolt();
+        /*
+         * This buffer has been accessed once recently and either
+         * its read is still in progress or it is in the cache.
+         */
+        if (HDR_IO_IN_PROGRESS(hdr)) {
+            hdr->b_l1hdr.b_arc_access = now;
+            return;
+        }
+        hdr->b_l1hdr.b_mru_hits++;
+        ARCSTAT_BUMP(arcstat_mru_hits);
         /*
-         * If this buffer is here because of a prefetch, then either:
-         * - clear the flag if this is a "referencing" read
-         *   (any subsequent access will bump this into the MFU state).
-         * or
-         * - move the buffer to the head of the list if this is
-         *   another prefetch (to make it less likely to be evicted).
+         * If the previous access was a prefetch, then it already
+         * handled possible promotion, so nothing more to do for now.
          */
-        if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
-            if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
-                /* link protected by hash lock */
-                ASSERT(multilist_link_active(
-                    &hdr->b_l1hdr.b_arc_node));
-            } else {
-                if (HDR_HAS_L2HDR(hdr))
-                    l2arc_hdr_arcstats_decrement_state(hdr);
-                arc_hdr_clear_flags(hdr,
-                    ARC_FLAG_PREFETCH |
-                    ARC_FLAG_PRESCIENT_PREFETCH);
-                hdr->b_l1hdr.b_mru_hits++;
-                ARCSTAT_BUMP(arcstat_mru_hits);
-                if (HDR_HAS_L2HDR(hdr))
-                    l2arc_hdr_arcstats_increment_state(hdr);
-            }
+        if (was_prefetch) {
             hdr->b_l1hdr.b_arc_access = now;
             return;
         }
         /*
-         * This buffer has been "accessed" only once so far,
-         * but it is still in the cache. Move it to the MFU
-         * state.
+         * If more than ARC_MINTIME has passed since the previous
+         * hit, promote the buffer to the MFU state.
          */
         if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
             ARC_MINTIME)) {
-            /*
-             * More than 125ms have passed since we
-             * instantiated this buffer.  Move it to the
-             * most frequently used state.
-             */
             hdr->b_l1hdr.b_arc_access = now;
             DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
-            arc_change_state(arc_mfu, hdr, hash_lock);
+            arc_change_state(arc_mfu, hdr);
         }
-        hdr->b_l1hdr.b_mru_hits++;
-        ARCSTAT_BUMP(arcstat_mru_hits);
     } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
-        arc_state_t *new_state;
         /*
-         * This buffer has been "accessed" recently, but
-         * was evicted from the cache.  Move it to the
-         * MFU state.
+         * This buffer has been accessed once recently, but was
+         * evicted from the cache.  Had the MRU been larger, it
+         * would already have been promoted to MFU, so promote it now.
          */
-        if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
-            new_state = arc_mru;
-            if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
-                if (HDR_HAS_L2HDR(hdr))
-                    l2arc_hdr_arcstats_decrement_state(hdr);
-                arc_hdr_clear_flags(hdr,
-                    ARC_FLAG_PREFETCH |
-                    ARC_FLAG_PRESCIENT_PREFETCH);
-                if (HDR_HAS_L2HDR(hdr))
-                    l2arc_hdr_arcstats_increment_state(hdr);
-            }
-            DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
-        } else {
-            new_state = arc_mfu;
-            DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
-        }
-
-        hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
-        arc_change_state(new_state, hdr, hash_lock);
-
         hdr->b_l1hdr.b_mru_ghost_hits++;
         ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+        hdr->b_l1hdr.b_arc_access = now;
+        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+        arc_change_state(arc_mfu, hdr);
     } else if (hdr->b_l1hdr.b_state == arc_mfu) {
         /*
-         * This buffer has been accessed more than once and is
-         * still in the cache.  Keep it in the MFU state.
-         *
-         * NOTE: an add_reference() that occurred when we did
-         * the arc_read() will have kicked this off the list.
-         * If it was a prefetch, we will explicitly move it to
-         * the head of the list now.
+         * This buffer has been accessed more than once and is either
+         * still in the cache or being restored from a ghost state.
          */
-
-        hdr->b_l1hdr.b_mfu_hits++;
-        ARCSTAT_BUMP(arcstat_mfu_hits);
-        hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+        if (!HDR_IO_IN_PROGRESS(hdr)) {
+            hdr->b_l1hdr.b_mfu_hits++;
+            ARCSTAT_BUMP(arcstat_mfu_hits);
+        }
+        hdr->b_l1hdr.b_arc_access = now;
     } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
-        arc_state_t *new_state = arc_mfu;
         /*
-         * This buffer has been accessed more than once but has
-         * been evicted from the cache.  Move it back to the
-         * MFU state.
+         * This buffer has been accessed more than once recently, but
+         * has been evicted from the cache.  Had the MFU been larger,
+         * it would have stayed cached, so move it back to the MFU state.
          */
-
-        if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
-            /*
-             * This is a prefetch access...
-             * move this block back to the MRU state.
-             */
-            new_state = arc_mru;
-        }
-
-        hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
-        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
-        arc_change_state(new_state, hdr, hash_lock);
-
         hdr->b_l1hdr.b_mfu_ghost_hits++;
         ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+        hdr->b_l1hdr.b_arc_access = now;
+        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+        arc_change_state(arc_mfu, hdr);
     } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
         /*
-         * This buffer is on the 2nd Level ARC.
+         * This buffer is on the 2nd Level ARC and was not accessed
+         * for a long time, so treat it as new and put it in the MRU state.
*/ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); @@ -5629,12 +5595,12 @@ arc_buf_access(arc_buf_t *buf) hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); + ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, + !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ @@ -5768,17 +5734,6 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } - /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were @@ -5851,7 +5806,9 @@ arc_read_done(zio_t *zio) ASSERT(callback_cnt < 2 || hash_lock != NULL); hdr->b_l1hdr.b_acb = NULL; + hdr->b_l1hdr.b_acbp = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ if (callback_cnt == 0) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); @@ -5863,7 +5820,7 @@ arc_read_done(zio_t *zio) } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); @@ -5912,7 +5869,13 @@ arc_read_done(zio_t *zio) } callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); + if (acb->acb_wait) { + mutex_enter(&acb->acb_wait_lock); + acb->acb_wait = B_FALSE; + cv_signal(&acb->acb_wait_cv); + mutex_exit(&acb->acb_wait_lock); + } else + kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) @@ -6002,12 +5965,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { + boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); @@ -6015,6 +5976,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, goto out; } + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { @@ -6028,19 +5990,18 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } + + 
DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. - * Synchronous reads use the b_cv whereas nowait reads - * register a callback. Both are signalled/called in - * arc_read_done. + * Synchronous reads use the acb_wait_cv whereas nowait + * reads register a callback. Both are signalled/called + * in arc_read_done. * * Errors of the physical I/O may need to be propagated * to the pio. For synchronous reads, we simply restart @@ -6049,17 +6010,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ - - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; - + arc_callback_t *acb = NULL; + if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; @@ -6068,17 +6020,39 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; + if (*arc_flags & ARC_FLAG_WAIT) { + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, + CV_DEFAULT, NULL); + } acb->acb_zb = *zb; - if (pio != NULL) + if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); + } acb->acb_zio_head = head_zio; - acb->acb_next = hdr->b_l1hdr.b_acb; - hdr->b_l1hdr.b_acb = acb; + *hdr->b_l1hdr.b_acbp = acb; + hdr->b_l1hdr.b_acbp = &acb->acb_next; } mutex_exit(hash_lock); + + ARCSTAT_BUMP(arcstat_iohits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, iohits); + + if (*arc_flags & ARC_FLAG_WAIT) { + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); + } goto out; } @@ -6086,28 +6060,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, hdr->b_l1hdr.b_state == arc_mfu); if (done && !no_buf) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. - */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } - ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. 
*/ @@ -6129,8 +6081,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, } } if (rc != 0) { - (void) remove_reference(hdr, hash_lock, - private); + (void) remove_reference(hdr, private); arc_buf_destroy_impl(buf); buf = NULL; } @@ -6138,25 +6089,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + arc_access(hdr, *arc_flags, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, hits); + *arc_flags |= ARC_FLAG_CACHED; if (done) done(NULL, zb, bp, buf, private); @@ -6237,19 +6177,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, } /* - * This is a delicate dance that we play here. - * This hdr might be in the ghost list so we access - * it to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). + * Call arc_adapt() explicitly before arc_access() + * to allow ghost states logic to balance MRU/MFU. */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); } + add_reference(hdr, hdr); /* For IO_IN_PROGRESS. 
*/ + if (!embedded_bp) + arc_access(hdr, *arc_flags, B_FALSE); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); @@ -6276,24 +6213,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } - if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -6306,7 +6229,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + hdr->b_l1hdr.b_acbp = &acb->acb_next; if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { @@ -6347,7 +6270,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); @@ -6369,7 +6292,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + !(l2arc_noprefetch && + (*arc_flags & ARC_FLAG_PREFETCH))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; @@ -6558,10 +6482,8 @@ arc_freed(spa_t *spa, const blkptr_t *bp) /* * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is + * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, + * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU @@ -6578,9 +6500,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. 
*/ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); + if (!HDR_HAS_L1HDR(hdr) || + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { @@ -6623,7 +6545,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; @@ -6683,7 +6605,7 @@ arc_release(arc_buf_t *buf, const void *tag) VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); + (void) remove_reference(hdr, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); @@ -6798,7 +6720,7 @@ arc_release(arc_buf_t *buf, const void *tag) hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); @@ -6872,10 +6794,12 @@ arc_write_ready(zio_t *zio) callback->awcb_ready(zio, buf, callback->awcb_private); - if (HDR_IO_IN_PROGRESS(hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ + } if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); @@ -7062,7 +6986,7 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); + arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); @@ -7082,12 +7006,14 @@ arc_write_done(zio_t *zio) } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. 
*/ } ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -7302,22 +7228,32 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); + as->arcstat_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); + as->arcstat_demand_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); + as->arcstat_demand_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); + as->arcstat_prefetch_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); + as->arcstat_prefetch_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = @@ -7500,10 +7436,18 @@ arc_kstat_update(kstat_t *ksp, int rw) aggsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); + as->arcstat_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); + as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + as->arcstat_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); + as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = @@ -7735,14 +7679,19 @@ arc_state_init(void) zfs_refcount_create(&arc_l2c_only->arcs_size); wmsum_init(&arc_sums.arcstat_hits, 0); + wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); 
wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); @@ -7816,8 +7765,12 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_prune, 0); aggsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); + wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); @@ -7865,14 +7818,19 @@ arc_state_fini(void) multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_sums.arcstat_hits); + wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); + wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); + wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); @@ -7946,8 +7904,12 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_prune); aggsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); + wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); @@ -9258,7 +9220,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. 
             */
-            arc_change_state(arc_anon, hdr, hash_lock);
+            arc_change_state(arc_anon, hdr);
             arc_hdr_destroy(hdr);
         } else {
             ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
         }
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 377634c72bba..244b9b4cbcbc 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -185,7 +185,8 @@ static boolean_t
 traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp,
     const zbookmark_phys_t *zb)
 {
-    arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+    arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+        ARC_FLAG_PRESCIENT_PREFETCH;
     int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 
     if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 1d63d7de65a1..76b8b5608a53 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -517,13 +517,11 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
     issued = 0;
     for (int64_t blk = pf_start; blk < pf_end; blk++) {
         issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
-            ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
-            dmu_zfetch_done, zs);
+            ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
     }
     for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
         issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
-            ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
-            dmu_zfetch_done, zs);
+            ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
     }
     if (!have_lock)
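
Reviewer notes (not part of the patch). The patch drops the per-header b_cv wait for in-flight reads: a synchronous arc_read() caller passing ARC_FLAG_WAIT now allocates an arc_callback_t with acb_wait set, and arc_read_done() signals acb_wait_cv under acb_wait_lock instead of broadcasting on the header, after which the waiter itself frees the callback. The sketch below models that handoff in user space with pthreads purely for illustration; the kernel code uses kmutex_t/kcondvar_t, and the wait_cb_t, complete_cb, and wait_for_completion names are invented here, not taken from the patch.

/*
 * Reviewer sketch (not part of the patch): the acb_wait handshake between
 * arc_read() and arc_read_done(), modeled with pthreads. Compile with
 * -pthread. Names are illustrative placeholders.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct wait_cb {
    bool            wait;       /* acb_wait: waiter still blocked */
    pthread_mutex_t wait_lock;  /* acb_wait_lock */
    pthread_cond_t  wait_cv;    /* acb_wait_cv */
} wait_cb_t;

/* Completion path: what arc_read_done() does for a waiting callback. */
static void
complete_cb(wait_cb_t *cb)
{
    pthread_mutex_lock(&cb->wait_lock);
    cb->wait = false;
    pthread_cond_signal(&cb->wait_cv);
    pthread_mutex_unlock(&cb->wait_lock);
    /* The waiter, not the completion path, tears down the callback. */
}

/* Issue path: what a synchronous (ARC_FLAG_WAIT) reader does after issuing. */
static void
wait_for_completion(wait_cb_t *cb)
{
    pthread_mutex_lock(&cb->wait_lock);
    while (cb->wait)    /* loop guards against spurious wakeups */
        pthread_cond_wait(&cb->wait_cv, &cb->wait_lock);
    pthread_mutex_unlock(&cb->wait_lock);
    pthread_mutex_destroy(&cb->wait_lock);
    pthread_cond_destroy(&cb->wait_cv);
}

static void *
io_thread(void *arg)
{
    complete_cb(arg);   /* simulate the physical I/O finishing */
    return (NULL);
}

int
main(void)
{
    wait_cb_t cb = { .wait = true };
    pthread_mutex_init(&cb.wait_lock, NULL);
    pthread_cond_init(&cb.wait_cv, NULL);

    pthread_t tid;
    pthread_create(&tid, NULL, io_thread, &cb);
    wait_for_completion(&cb);
    pthread_join(tid, NULL);
    printf("read completed\n");
    return (0);
}

Having the waiter own the lock/condvar pair mirrors how arc_read() destroys acb_wait_lock and acb_wait_cv only after acb_wait has been cleared, so the completion path never touches freed synchronization objects.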
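
The new l1arc_buf_hdr b_acbp field keeps a pointer to the tail acb_next slot, so arc_read() can append a callback for an in-flight read in arrival order (*hdr->b_l1hdr.b_acbp = acb; hdr->b_l1hdr.b_acbp = &acb->acb_next;) instead of pushing onto the b_acb head. Below is a minimal stand-alone sketch of that pointer-to-pointer append; hdr_t, cb_t, and the initialization order are simplified placeholders rather than the patch's exact code.

/*
 * Reviewer sketch (not part of the patch): appending to a singly linked
 * callback list through a tail pointer-to-pointer, as b_acb/b_acbp do.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct cb {
    int         id;
    struct cb   *next;      /* acb_next */
} cb_t;

typedef struct hdr {
    cb_t    *cb_list;       /* b_acb: list head */
    cb_t    **cb_tailp;     /* b_acbp: address of the tail's next slot */
} hdr_t;

static void
hdr_init(hdr_t *h)
{
    h->cb_list = NULL;
    h->cb_tailp = &h->cb_list;  /* empty list: the head pointer is the tail slot */
}

static void
hdr_append_cb(hdr_t *h, cb_t *cb)
{
    cb->next = NULL;
    *h->cb_tailp = cb;          /* link after the current tail (or as head) */
    h->cb_tailp = &cb->next;    /* the new callback's next slot becomes the tail */
}

int
main(void)
{
    hdr_t h;
    hdr_init(&h);
    for (int i = 0; i < 3; i++) {
        cb_t *cb = calloc(1, sizeof (*cb));
        cb->id = i;
        hdr_append_cb(&h, cb);
    }
    for (cb_t *cb = h.cb_list; cb != NULL; ) {
        cb_t *next = cb->next;
        printf("callback %d\n", cb->id);    /* prints 0, 1, 2: arrival order */
        free(cb);
        cb = next;
    }
    return (0);
}

Appending keeps arc_read_done() invoking callbacks in the order the readers arrived, where the old prepend-to-head insertion ran them in reverse.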
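
The stats changes split every former "hit" into three outcomes: hits (the data was already cached), the new iohits (the request attached to a read already in flight), and misses, each further broken down demand vs. prefetch and data vs. metadata, matching the new arcstat_*_iohits counters and the arc__iohit probe. A toy classification sketch follows; the enum, array, and function names are invented for illustration and are not the ARCSTAT_CONDSTAT macro itself.

/*
 * Reviewer sketch (not part of the patch): three-way hit/iohit/miss
 * accounting with demand/prefetch and data/metadata breakdowns.
 */
#include <stdint.h>
#include <stdio.h>

typedef enum { ACC_HIT, ACC_IOHIT, ACC_MISS } access_result_t;

static uint64_t counters[3][2][2];  /* [result][demand?][metadata?] */

static void
account_access(access_result_t res, int is_demand, int is_metadata)
{
    counters[res][is_demand ? 1 : 0][is_metadata ? 1 : 0]++;
}

static const char *res_names[] = { "hits", "iohits", "misses" };
static const char *kind_names[] = { "prefetch", "demand" };
static const char *type_names[] = { "data", "metadata" };

int
main(void)
{
    account_access(ACC_HIT, 1, 0);      /* demand data already cached */
    account_access(ACC_IOHIT, 1, 1);    /* demand metadata joined an in-flight read */
    account_access(ACC_MISS, 0, 0);     /* prefetch data not cached at all */

    for (int r = 0; r < 3; r++)
        for (int d = 0; d < 2; d++)
            for (int m = 0; m < 2; m++)
                if (counters[r][d][m] != 0)
                    printf("%s_%s_%s = %llu\n", kind_names[d],
                        type_names[m], res_names[r],
                        (unsigned long long)counters[r][d][m]);
    return (0);
}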