use sharded associative cache for index entry cache (openzfs#808)
When the zettacache does an index merge, it also adds the new and
changed entries to the index entry cache.  In certain configurations,
manipulating the index entry cache can be very time-consuming and can
significantly degrade the performance of concurrent zettacache
activities.  This is especially noticeable when the zettacache doesn't
have a lot of data in it, e.g. when initially filling up.

Additionally, the current data structure used for the index entry cache
is not memory-efficient; a large fraction of its memory is consumed by
internal overhead.

This commit changes the data structure used by the index entry cache to
be a sharded 16-way associative roughly-LRU cache.  Each entry can be
stored in any of 16 "slots", which are searched when doing a lookup.
When inserting and all 16 slots are full, the slot whose IndexValue has
the oldest Atime is evicted.  Each shard of the cache is locked
separately, allowing concurrent access to the overall entry cache.
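
As a rough sketch of the resulting API (the Cache type added in
util/src/cache.rs below; the toy key and value types here are
hypothetical stand-ins for the real IndexEntry/IndexValue):

use util::cache::Cache;
use util::cache::LruTimestampOwned;

// Hypothetical value type; the real cache stores IndexValue, whose Atime
// serves as the LRU timestamp.
#[derive(Clone)]
struct Value {
    data: u64,
    last_used: u64,
}

impl LruTimestampOwned for Value {
    type Timestamp = u64;
    fn get_timestamp(&self) -> u64 {
        self.last_used
    }
    fn update_timestamp(&self) {
        // No-op, as with IndexValue: timestamps are refreshed by
        // re-inserting the value.
    }
}

fn example() {
    // Capacity is rounded up to a power-of-two number of 1024-entry shards.
    let cache: Cache<u64, Value> = Cache::new(100_000);
    cache.insert(1, Value { data: 42, last_used: 0 });
    assert_eq!(cache.get(&1).map(|v| v.data), Some(42));
}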

This improves performance in several ways:

The index entry cache can be updated concurrently with lookups, so
zettacache lookup/insert performance is not impacted as much by merging.
On a workload of random reads causing inserts to the zettacache via
sibling block ingestion, a merge without this commit causes insertion
performance to drop to ~45% of its pre-merge rate (420,000 -> 190,000
inserts/sec).

The time to update the index entry cache is reduced, so the overall time
to do a merge is reduced. When the index is small, merge time drops to
~20% of previous (a 5x improvement; 93 -> 19 seconds).

The number of entries that can be cached in the given RAM budget is
roughly tripled.  The new memory usage per entry is 37% of the previous
usage (65 -> 24 bytes per entry; the IndexEntry size is 23 bytes).
ahrens committed Apr 21, 2023
1 parent 4effc6c commit 1f28ff8
Showing 6 changed files with 222 additions and 52 deletions.
17 changes: 17 additions & 0 deletions cmd/zfs_object_agent/Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions cmd/zfs_object_agent/util/Cargo.toml
@@ -9,6 +9,7 @@ publish = false
[dependencies]
anyhow = "1.0"
arr_macro = "0.2.1"
associative-cache = "1.0.1"
async-trait = "0.1.68"
atty = "0.2"
atomic-counter = "1.0.1"
@@ -23,6 +24,7 @@ dashmap = "5.1.0"
derivative = "2.2.0"
enum-map = { version = "2.5.0", features = ["serde"] }
futures = "0.3.26"
fxhash = "0.2.1"
humantime = "2.1.0"
itertools = "0.10.5"
iset = "0.2.2"
121 changes: 121 additions & 0 deletions cmd/zfs_object_agent/util/src/cache.rs
@@ -0,0 +1,121 @@
use std::hash::Hash;
use std::sync::Mutex;
use std::sync::MutexGuard;

use associative_cache::AssociativeCache;
use associative_cache::Capacity;
use associative_cache::Capacity1024;
use associative_cache::HashSixteenWay;
use associative_cache::Replacement;
use derivative::Derivative;
use fxhash::hash64;
use fxhash::FxHasher;

use crate::from64::AsUsize;

/// A fixed-size, sharded, 16-way associative cache. Each shard has 1024 entries. Each entry
/// can be stored in any of 16 "slots", which are searched when doing a lookup. When inserting
/// and all 16 slots are occupied, the slot with the smallest LruTimestamp is evicted. Each
/// shard of the cache is locked separately, allowing concurrent access to the overall entry
/// cache.
pub struct Cache<K, V: LruTimestampOwned> {
    inner: Vec<Mutex<Inner<K, V>>>,
    mask: usize,
}

type Cap = Capacity1024;

#[derive(Derivative)]
#[derivative(Default(bound = ""))]
struct Inner<K, V: LruTimestampOwned> {
    cache: AssociativeCache<K, V, Cap, HashSixteenWay<FxHasher>, LruReplacementOwned>,
}

impl<K: Hash + PartialEq, V: LruTimestampOwned + Clone> Cache<K, V> {
    /// Each shard has 1024 entries. The capacity is rounded up to the next power-of-two number
    /// of shards.
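    /// For example, a requested capacity of 100,000 entries rounds up to 128 shards
    /// (131,072 slots).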
    pub fn new(capacity: usize) -> Self {
        let shards = ((capacity + Cap::CAPACITY - 1) / Cap::CAPACITY).next_power_of_two();
        let inner = (0..shards).map(|_| Default::default()).collect();
        let mask = (1 << shards.trailing_zeros() as usize) - 1;
        Self { inner, mask }
    }

    fn inner(&self, key: &K) -> MutexGuard<Inner<K, V>> {
        // The low bits are used to select the slot within the shard. The rest are used to select
        // the shard.
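        // Dividing by Cap::CAPACITY (1024) discards the low 10 bits of the hash before masking.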
        let index = (hash64(key).as_usize() / Cap::CAPACITY) & self.mask;
        self.inner[index].lock().unwrap()
    }

    /// Insert a new entry into the cache.
    ///
    /// If there is an old entry for this key, or if another entry ends up
    /// getting replaced by this new one, return the old entry.
    #[inline]
    pub fn insert(&self, key: K, value: V) -> Option<(K, V)> {
        self.inner(&key).cache.insert(key, value)
    }

    /// Return a clone of the value for a given key, if it exists in the cache.
    #[inline]
    pub fn get(&self, key: &K) -> Option<V> {
        self.inner(key).cache.get(key).cloned()
    }

    /// Remove an entry from the cache.
    ///
    /// If an entry for the key existed in the cache, it is removed and `Some`
    /// is returned. Otherwise, `None` is returned.
    #[inline]
    pub fn remove(&self, key: &K) -> Option<V> {
        self.inner(key).cache.remove(key)
    }
}

/// Like associative_cache::LruTimestamp, but the timestamp is returned by value instead of
/// needing to be a reference into the object.
pub trait LruTimestampOwned {
    type Timestamp: PartialOrd;
    fn get_timestamp(&self) -> Self::Timestamp;
    fn update_timestamp(&self);
}

#[derive(Default)]
struct LruReplacementOwned;

impl<V, C> Replacement<V, C> for LruReplacementOwned
where
    C: Capacity,
    V: LruTimestampOwned,
{
    #[inline]
    fn choose_for_replacement<'a>(
        &mut self,
        candidates: impl ExactSizeIterator<Item = (usize, &'a V)>,
    ) -> usize
    where
        V: 'a,
    {
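        // Scan all candidate slots and return the index of the one with the
        // smallest (oldest) timestamp.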
        candidates
            .fold(None, |lru, (index, value)| {
                let timestamp = value.get_timestamp();
                match lru {
                    Some((t, i)) if t < timestamp => Some((t, i)),
                    _ => Some((timestamp, index)),
                }
            })
            .expect("candidates should not be empty")
            .1
    }

    #[inline]
    fn on_hit(&self, value: &V) {
        value.update_timestamp();
    }

    #[inline]
    fn on_insert(&self, value: &V) {
        value.update_timestamp();
    }
}
1 change: 1 addition & 0 deletions cmd/zfs_object_agent/util/src/lib.rs
@@ -8,6 +8,7 @@ pub mod async_cache;
mod binaryindextree;
mod bitrange;
mod btreemap_ext;
pub mod cache;
pub mod cffi;
pub mod concurrent_batch;
mod credentials;
14 changes: 14 additions & 0 deletions cmd/zfs_object_agent/zettacache/src/index.rs
@@ -12,6 +12,7 @@ use serde::de::Error;
use serde::de::Visitor;
use serde::Deserialize;
use serde::Serialize;
use util::cache::LruTimestampOwned;
use util::cffi;
use util::tunable;
use util::writeln_stdout;
@@ -102,6 +103,19 @@ impl IndexValue
    }
}

impl LruTimestampOwned for IndexValue {
    type Timestamp = Atime;

    fn get_timestamp(&self) -> Self::Timestamp {
        self.atime
    }

    fn update_timestamp(&self) {
        // The atime will be updated by replacing the value in the cache, so this is a no-op.
        // See zettacache::Inner::checkpoint_task().
    }
}

#[derive(Debug, Cffi, Copy, Clone)]
#[repr(C)]
pub struct IndexEntry {
