Skip to content

Commit

Permalink
multiplicative_hash: new module for const-fn mix + remap
Browse files Browse the repository at this point in the history
Sharded caches need a deterministic function to mix potentially dodgy
hash values and map them to a range.  Steal the logic (and improve it
by fully implementing Dietzfelbinger's multiplicative hash), and make
it possible to initialise the parameters with a compile-time sha256,
instead of manually inputting pseudorandom values.

TESTED=new and existing tests.
  • Loading branch information
pkhuong committed Sep 23, 2021
1 parent 078b3aa commit 51eb3d0
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 10 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ edition = "2018"
license = "MIT"

[dependencies]
extendhash = "1"
filetime = "0.2"
rand = "0.8"
tempfile = "3"
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@
//! files or directories that start with a `.`, as long as they do not
//! collide with the `.kismet` prefix.
mod cache_dir;
mod multiplicative_hash;
pub mod plain;
pub mod raw_cache;
mod readonly;
Expand Down
175 changes: 175 additions & 0 deletions src/multiplicative_hash.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
/// Multiplicative hash structs implement
/// [Dietzfelbinger's universal multiplicative hash function](https://link.springer.com/chapter/10.1007/978-3-319-98355-4_15)
/// with `const fn` keyed constructors, and pair that with a range
/// reduction function from `u64` to a `usize` range that extends
/// Dietzfelbinger's power-of-two scheme.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub(crate) struct MultiplicativeHash {
    // Pseudorandom odd multiplier.  `new` forces the low bit on, so
    // `x -> x.wrapping_mul(multiplier)` is invertible mod 2^64.
    multiplier: u64,
    // Pseudorandom value added to the product (mod 2^64).
    addend: u64,
}

/// Linearly maps values in `[0, u64::MAX]` onto `[0, domain)` via a
/// fixed-point multiplication: the result is the high 64-bit word of
/// the 128-bit product `x * domain`.
///
/// As a special case, this function returns 0 instead of erroring out
/// when `domain == 0`.
#[inline(always)]
const fn reduce(x: u64, domain: usize) -> usize {
    let product = (x as u128) * (domain as u128);
    (product >> 64) as usize
}

impl MultiplicativeHash {
    /// Builds a `MultiplicativeHash` with the given parameters,
    /// forcing the multiplier odd if necessary (an even multiplier
    /// would not be invertible mod 2^64).
    pub const fn new(multiplier: u64, addend: u64) -> MultiplicativeHash {
        MultiplicativeHash {
            multiplier: multiplier | 1,
            addend,
        }
    }

    /// Deterministically derives a `MultiplicativeHash` from `key`:
    /// the parameters are the first 16 bytes of the key's SHA-256
    /// digest, read as two little-endian `u64`s.
    pub const fn new_keyed(key: &[u8]) -> MultiplicativeHash {
        use extendhash::sha256;

        let digest = sha256::compute_hash(key);
        // Element-by-element array construction keeps this usable in
        // a `const fn` context.
        let mul_bytes = [
            digest[0], digest[1], digest[2], digest[3], digest[4], digest[5], digest[6],
            digest[7],
        ];
        let add_bytes = [
            digest[8], digest[9], digest[10], digest[11], digest[12], digest[13], digest[14],
            digest[15],
        ];

        MultiplicativeHash::new(u64::from_le_bytes(mul_bytes), u64::from_le_bytes(add_bytes))
    }

    /// Builds a `MultiplicativeHash` with parameters drawn from the
    /// thread-local PRNG.
    pub fn new_random() -> MultiplicativeHash {
        use rand::Rng;

        let mut rng = rand::thread_rng();
        MultiplicativeHash::new(rng.gen(), rng.gen())
    }

    /// Mixes `value` with this hash's parameters. If you must
    /// truncate the result, use its high bits.
    #[inline(always)]
    pub const fn mix(&self, value: u64) -> u64 {
        // Same affine map as `value * multiplier + addend`, all mod
        // 2^64; wrapping addition is commutative.
        self.addend.wrapping_add(value.wrapping_mul(self.multiplier))
    }

    /// Mixes `value` and maps the result to a `usize` less than
    /// `range`.
    ///
    /// If `range == 0`, always returns 0.
    #[inline(always)]
    pub const fn map(&self, value: u64, range: usize) -> usize {
        reduce(self.mix(value), range)
    }
}

/// Smoke test the `reduce` function.
#[test]
fn test_reduce() {
    // An empty domain always maps to 0, regardless of the input.
    for &x in &[0, u64::MAX] {
        assert_eq!(reduce(x, 0), 0);
    }

    // Check bucket boundaries for a domain of 17: each consecutive
    // 17th of the u64 range lands in the next bucket.
    assert_eq!(reduce(0, 17), 0);
    assert_eq!(reduce(u64::MAX / 17, 17), 0);
    assert_eq!(reduce(u64::MAX / 17 + 1, 17), 1);
    assert_eq!(reduce(u64::MAX, 17), 16);
}

/// Mapping to a power-of-two sized range must be equivalent to
/// keeping the input's high bits.
#[test]
fn test_reduce_power_of_two() {
    // reduce(x, 1 << k) == x >> (64 - k).
    assert_eq!(reduce(10u64 << 33, 1usize << 32), 20); // 10 << 1
    assert_eq!(reduce(15u64 << 60, 1usize << 8), 240); // 15 << 4
}

/// Two hashers constructed from different keys should disagree on
/// `mix` for every probe point.
#[test]
fn test_mix() {
    let h1 = MultiplicativeHash::new_keyed(b"h1");
    let h2 = MultiplicativeHash::new_keyed(b"h2");

    assert_ne!(h1, h2);

    for &x in &[0, 1, 42, u64::MAX] {
        assert_ne!(h1.mix(x), h2.mix(x));
    }
}

/// Two independently seeded random hashers should disagree on `mix`
/// for every probe point.
#[test]
fn test_random_mix() {
    let h1 = MultiplicativeHash::new_random();
    let h2 = MultiplicativeHash::new_random();

    assert_ne!(h1, h2);

    for &x in &[0, 1, 42, u64::MAX] {
        assert_ne!(h1.mix(x), h2.mix(x));
    }
}

/// Two hashers constructed from different keys should disagree on
/// `map` for every (point, range) probe.
#[test]
fn test_map() {
    let h1 = MultiplicativeHash::new_keyed(b"h1");
    let h2 = MultiplicativeHash::new_keyed(b"h2");

    assert_ne!(h1, h2);

    for &(x, range) in &[(0, 1024), (1, 1234), (42, 4567), (u64::MAX, 789)] {
        assert_ne!(h1.map(x, range), h2.map(x, range));
    }
}

/// Confirm that keyed construction is const-evaluable and yields
/// stable, deterministic parameters.
#[test]
fn test_new_keyed() {
    const H: MultiplicativeHash = MultiplicativeHash::new_keyed(b"asdfg");

    // The map is affine, so two points pin down both parameters:
    // mix(0) is the addend, and mix(1) - mix(0) is the multiplier.
    const ADDEND: u64 = 7162733811001658625;
    const MULTIPLIER: u64 = 7388750581746985465; // 14551484392748644090 - ADDEND

    assert_eq!(H.mix(0), ADDEND);
    assert_eq!(H.addend, ADDEND);
    assert_eq!(H.mix(1), 14551484392748644090);
    assert_eq!(H.multiplier, MULTIPLIER);

    assert_eq!(H, MultiplicativeHash::new(MULTIPLIER, ADDEND));

    // Spot-check a couple more points against the affine formula.
    for &x in &[42u64, u64::MAX] {
        assert_eq!(H.mix(x), x.wrapping_mul(MULTIPLIER).wrapping_add(ADDEND));
    }
}
20 changes: 10 additions & 10 deletions src/sharded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use std::sync::atomic::Ordering::Relaxed;
use std::sync::Arc;

use crate::cache_dir::CacheDir;
use crate::multiplicative_hash::MultiplicativeHash;
use crate::trigger::PeriodicTrigger;
use crate::Key;
use crate::KISMET_TEMPORARY_SUBDIRECTORY as TEMP_SUBDIR;
Expand All @@ -40,9 +41,14 @@ use crate::KISMET_TEMPORARY_SUBDIRECTORY as TEMP_SUBDIR;
/// shard capacity inserts or updates.
const MAINTENANCE_SCALE: usize = 2;

// NOTE(review): RANDOM_MULTIPLIER and SECONDARY_RANDOM_MULTIPLIER
// appear to be the pre-refactor hand-picked multipliers superseded by
// the keyed mixers below — confirm nothing still references them
// before deleting.
const RANDOM_MULTIPLIER: u64 = 0xf2efdf1111adba6f;
/// These mixers must be the same for all processes that access the
/// same sharded cache directory. That's why we derive the parameters
/// in a const function.
const PRIMARY_MIXER: MultiplicativeHash =
    MultiplicativeHash::new_keyed(b"kismet: primary shard mixer");

const SECONDARY_RANDOM_MULTIPLIER: u64 = 0xa55e1e02718a6a47;
const SECONDARY_MIXER: MultiplicativeHash =
    MultiplicativeHash::new_keyed(b"kismet: secondary shard mixer");

/// A sharded cache is a hash-sharded directory of cache
/// subdirectories. Each subdirectory is managed as an
Expand Down Expand Up @@ -189,18 +195,12 @@ impl Cache {
/// Maps `key` to its pair of shard indices, each less than
/// `self.num_shards`; the second component is derived from the pair
/// via `other_shard_id`.
///
/// We can't assume the key's hash values are well distributed, so we
/// mix each one with a keyed multiplicative hash before the range
/// reduction.  The mixers are derived from fixed keys, so every
/// process maps the same key to the same shards.
fn shard_ids(&self, key: Key) -> (usize, usize) {
    // Defect fixed: diff residue had left the old `remap` closure in
    // place and rebound `h1`/`h2` with the legacy multipliers,
    // shadowing (and discarding) the keyed-mixer values below.
    let h1 = PRIMARY_MIXER.map(key.hash, self.num_shards);
    let h2 = SECONDARY_MIXER.map(key.secondary_hash, self.num_shards);

    // We do not apply a 2-left strategy because our load
    // estimates can saturate. When that happens, we want to
    // revert to sharding based on `key.hash`.
    (h1, self.other_shard_id(h1, h2))
}

Expand Down

0 comments on commit 51eb3d0

Please sign in to comment.