From 51eb3d01885072a2c604cdd65b2614c77d118046 Mon Sep 17 00:00:00 2001
From: Paul Khuong
Date: Sun, 12 Sep 2021 22:28:32 -0400
Subject: [PATCH] multiplicative_hash: new module for const-fn mix + remap

Sharded caches need a deterministic function to mix potentially dodgy
hash values and map them to a range. Steal that logic from
`sharded.rs` (and improve it by fully implementing Dietzfelbinger's
multiplicative hash), and make it possible to initialise the
parameters with a compile-time SHA-256 hash, instead of manually
inputting pseudorandom values.

TESTED=new and existing tests.
---
 Cargo.toml                 |   1 +
 src/lib.rs                 |   1 +
 src/multiplicative_hash.rs | 175 +++++++++++++++++++++++++++++++++++++
 src/sharded.rs             |  20 ++---
 4 files changed, 187 insertions(+), 10 deletions(-)
 create mode 100644 src/multiplicative_hash.rs

diff --git a/Cargo.toml b/Cargo.toml
index 316e356..815168b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2018"
 license = "MIT"
 
 [dependencies]
+extendhash = "1"
 filetime = "0.2"
 rand = "0.8"
 tempfile = "3"
diff --git a/src/lib.rs b/src/lib.rs
index 0bf11ca..bb05c91 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -224,6 +224,7 @@
 //! files or directories that start with a `.`, as long as they do not
 //! collide with the `.kismet` prefix.
 mod cache_dir;
+mod multiplicative_hash;
 pub mod plain;
 pub mod raw_cache;
 mod readonly;
diff --git a/src/multiplicative_hash.rs b/src/multiplicative_hash.rs
new file mode 100644
index 0000000..5d6c364
--- /dev/null
+++ b/src/multiplicative_hash.rs
@@ -0,0 +1,175 @@
+/// Multiplicative hash structs implement
+/// [Dietzfelbinger's universal multiplicative hash function](https://link.springer.com/chapter/10.1007/978-3-319-98355-4_15)
+/// with `const fn` keyed constructors, and pair that with a range
+/// reduction function from `u64` to a `usize` range that extends
+/// Dietzfelbinger's power-of-two scheme.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub(crate) struct MultiplicativeHash {
+    // Pseudorandom odd multiplier
+    multiplier: u64,
+    // Pseudorandom value added to the product
+    addend: u64,
+}
+
+/// Maps values in `[0, u64::MAX]` to `[0, domain)` linearly.
+///
+/// As a special case, this function returns 0 instead of erroring out
+/// when `domain == 0`.
+#[inline(always)]
+const fn reduce(x: u64, domain: usize) -> usize {
+    ((domain as u128 * x as u128) >> 64) as usize
+}
+
+impl MultiplicativeHash {
+    /// Constructs a `MultiplicativeHash` with the arguments as parameters.
+    /// The multiplier is converted to an odd integer if necessary.
+    pub const fn new(multiplier: u64, addend: u64) -> MultiplicativeHash {
+        MultiplicativeHash {
+            multiplier: multiplier | 1,
+            addend,
+        }
+    }
+
+    /// Deterministically constructs a `MultiplicativeHash` with
+    /// parameters derived from the `key`, via a SHA-256 hash.
+    pub const fn new_keyed(key: &[u8]) -> MultiplicativeHash {
+        use extendhash::sha256;
+
+        let hash = sha256::compute_hash(key);
+        let multiplier = [
+            hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7],
+        ];
+        let addend = [
+            hash[8], hash[9], hash[10], hash[11], hash[12], hash[13], hash[14], hash[15],
+        ];
+
+        MultiplicativeHash::new(u64::from_le_bytes(multiplier), u64::from_le_bytes(addend))
+    }
+
+    /// Constructs a new pseudorandom `MultiplicativeHash`.
+    pub fn new_random() -> MultiplicativeHash {
+        use rand::Rng;
+
+        let mut rnd = rand::thread_rng();
+        MultiplicativeHash::new(rnd.gen(), rnd.gen())
+    }
+
+    /// Mixes `value` with this hash's parameters. If you must
+    /// truncate the result, use its high bits.
+    #[inline(always)]
+    pub const fn mix(&self, value: u64) -> u64 {
+        value
+            .wrapping_mul(self.multiplier)
+            .wrapping_add(self.addend)
+    }
+
+    /// Mixes `value` and maps the result to a `usize` less than `range`.
+    ///
+    /// If `range == 0`, always returns 0.
+    #[inline(always)]
+    pub const fn map(&self, value: u64, range: usize) -> usize {
+        reduce(self.mix(value), range)
+    }
+}
+
+/// Smoke test the `reduce` function.
+#[test]
+fn test_reduce() {
+    // Mapping to an empty range should always return 0.
+    assert_eq!(reduce(0, 0), 0);
+    assert_eq!(reduce(u64::MAX, 0), 0);
+
+    // Smoke test the range reduction
+    assert_eq!(reduce(0, 17), 0);
+    assert_eq!(reduce(u64::MAX / 17, 17), 0);
+    assert_eq!(reduce(1 + u64::MAX / 17, 17), 1);
+    assert_eq!(reduce(u64::MAX, 17), 16);
+}
+
+/// Mapping to a power-of-two sized range is the same as taking the
+/// high bits.
+#[test]
+fn test_reduce_power_of_two() {
+    assert_eq!(reduce(10 << 33, 1 << 32), 10 << 1);
+    assert_eq!(reduce(15 << 60, 1 << 8), 15 << 4);
+}
+
+/// Construct two different hashers. We should get different values
+/// for `mix`.
+#[test]
+fn test_mix() {
+    let h1 = MultiplicativeHash::new_keyed(b"h1");
+    let h2 = MultiplicativeHash::new_keyed(b"h2");
+
+    assert!(h1 != h2);
+
+    assert!(h1.mix(0) != h2.mix(0));
+    assert!(h1.mix(1) != h2.mix(1));
+    assert!(h1.mix(42) != h2.mix(42));
+    assert!(h1.mix(u64::MAX) != h2.mix(u64::MAX));
+}
+
+/// Construct two random hashers. We should get different values
+/// for `mix`.
+#[test]
+fn test_random_mix() {
+    let h1 = MultiplicativeHash::new_random();
+    let h2 = MultiplicativeHash::new_random();
+
+    assert!(h1 != h2);
+
+    assert!(h1.mix(0) != h2.mix(0));
+    assert!(h1.mix(1) != h2.mix(1));
+    assert!(h1.mix(42) != h2.mix(42));
+    assert!(h1.mix(u64::MAX) != h2.mix(u64::MAX));
+}
+
+/// Construct two different hashers. We should get different
+/// values for `map`.
+#[test]
+fn test_map() {
+    let h1 = MultiplicativeHash::new_keyed(b"h1");
+    let h2 = MultiplicativeHash::new_keyed(b"h2");
+
+    assert!(h1 != h2);
+
+    assert!(h1.map(0, 1024) != h2.map(0, 1024));
+    assert!(h1.map(1, 1234) != h2.map(1, 1234));
+    assert!(h1.map(42, 4567) != h2.map(42, 4567));
+    assert!(h1.map(u64::MAX, 789) != h2.map(u64::MAX, 789));
+}
+
+/// Confirm that construction is const and deterministic.
+#[test]
+fn test_new_keyed() {
+    const H: MultiplicativeHash = MultiplicativeHash::new_keyed(b"asdfg");
+
+    // Given the nature of the hash function, two points suffice to
+    // derive the parameters.
+
+    // addend = 7162733811001658625
+    assert_eq!(H.mix(0), 7162733811001658625);
+    assert_eq!(H.addend, 7162733811001658625);
+    // multiplier = 14551484392748644090 - addend = 7388750581746985465
+    assert_eq!(H.mix(1), 14551484392748644090);
+    assert_eq!(H.multiplier, 7388750581746985465);
+
+    assert_eq!(
+        H,
+        MultiplicativeHash::new(7388750581746985465, 7162733811001658625)
+    );
+
+    // But it doesn't hurt to test a couple more points.
+    assert_eq!(
+        H.mix(42),
+        42u64
+            .wrapping_mul(7388750581746985465)
+            .wrapping_add(7162733811001658625)
+    );
+    assert_eq!(
+        H.mix(u64::MAX),
+        u64::MAX
+            .wrapping_mul(7388750581746985465)
+            .wrapping_add(7162733811001658625)
+    );
+}
diff --git a/src/sharded.rs b/src/sharded.rs
index ff5f102..f46ef17 100644
--- a/src/sharded.rs
+++ b/src/sharded.rs
@@ -31,6 +31,7 @@ use std::sync::atomic::Ordering::Relaxed;
 use std::sync::Arc;
 
 use crate::cache_dir::CacheDir;
+use crate::multiplicative_hash::MultiplicativeHash;
 use crate::trigger::PeriodicTrigger;
 use crate::Key;
 use crate::KISMET_TEMPORARY_SUBDIRECTORY as TEMP_SUBDIR;
@@ -40,9 +41,14 @@ use crate::KISMET_TEMPORARY_SUBDIRECTORY as TEMP_SUBDIR;
 /// shard capacity inserts or updates.
 const MAINTENANCE_SCALE: usize = 2;
 
-const RANDOM_MULTIPLIER: u64 = 0xf2efdf1111adba6f;
+/// These mixers must be the same for all processes that access the
+/// same sharded cache directory. That's why we derive the parameters
+/// in a const function.
+const PRIMARY_MIXER: MultiplicativeHash =
+    MultiplicativeHash::new_keyed(b"kismet: primary shard mixer");
 
-const SECONDARY_RANDOM_MULTIPLIER: u64 = 0xa55e1e02718a6a47;
+const SECONDARY_MIXER: MultiplicativeHash =
+    MultiplicativeHash::new_keyed(b"kismet: secondary shard mixer");
 
 /// A sharded cache is a hash-sharded directory of cache
 /// subdirectories. Each subdirectory is managed as an
@@ -189,18 +195,12 @@ impl Cache {
     fn shard_ids(&self, key: Key) -> (usize, usize) {
         // We can't assume the hash is well distributed, so mix it
         // around a bit with a multiplicative hash.
-        let remap = |x: u64, mul: u64| {
-            let hash = x.wrapping_mul(mul) as u128;
-            // Map the hashed hash to a shard id with a fixed point
-            // multiplication.
-            ((self.num_shards as u128 * hash) >> 64) as usize
-        };
+        let h1 = PRIMARY_MIXER.map(key.hash, self.num_shards);
+        let h2 = SECONDARY_MIXER.map(key.secondary_hash, self.num_shards);
 
         // We do not apply a 2-left strategy because our load
         // estimates can saturate. When that happens, we want to
         // revert to sharding based on `key.hash`.
-        let h1 = remap(key.hash, RANDOM_MULTIPLIER);
-        let h2 = remap(key.secondary_hash, SECONDARY_RANDOM_MULTIPLIER);
         (h1, self.other_shard_id(h1, h2))
     }
 
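For readers skimming the patch, the new module boils down to two integer
tricks: `mix` applies an affine function `value * multiplier + addend`
modulo 2^64 with an odd (hence invertible) multiplier, and `reduce` maps
the mixed 64-bit value into `[0, domain)` with a 128-bit fixed-point
multiplication, `(domain * x) >> 64`. The standalone sketch below replays
that arithmetic outside the crate; the multiplier, addend, and shard
count are arbitrary illustrative values, not constants Kismet derives.

// Standalone sketch of the mix + range-reduction arithmetic from the
// patch above. The multiplier, addend, and shard count are made-up
// illustrative values.
fn main() {
    let multiplier: u64 = 0x9e37_79b9_7f4a_7c15 | 1; // forced odd, like `new`
    let addend: u64 = 0x1234_5678_9abc_def0;

    // `mix`: affine map modulo 2^64; the odd multiplier makes it a bijection.
    let mix = |value: u64| value.wrapping_mul(multiplier).wrapping_add(addend);

    // `reduce`: treat `x / 2^64` as a fraction in [0, 1) and scale it by
    // `domain`, entirely in integer arithmetic.
    let reduce = |x: u64, domain: usize| ((domain as u128 * x as u128) >> 64) as usize;

    // The midpoint of the u64 range lands on the midpoint of the domain...
    assert_eq!(reduce(1u64 << 63, 10), 5);
    // ...and a power-of-two domain keeps only the high bits.
    assert_eq!(reduce(0xabcd_0000_0000_0000, 1 << 16), 0xabcd);

    // Composing the two is what `MultiplicativeHash::map` (and therefore
    // `shard_ids`) does.
    let num_shards = 100; // hypothetical shard count
    let shard = reduce(mix(0xdead_beef), num_shards);
    assert!(shard < num_shards);
    println!("shard = {}", shard);
}

Because `reduce` only looks at the high bits of the mixed value, a
power-of-two `domain` degenerates to taking the top bits, which is the
power-of-two scheme the module doc refers to; arbitrary domains just
generalise the same fixed-point scaling, as `test_reduce_power_of_two`
checks.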