Skip to content

Commit

Permalink
Port frequency sketch from Caffeine
Browse files Browse the repository at this point in the history
  • Loading branch information
tatsuya6502 committed Jan 19, 2021
1 parent c98d71b commit 42fb4c8
Show file tree
Hide file tree
Showing 5 changed files with 298 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
"Moka",
"Ristretto",
"Tatsuya",
"benmanes",
"clippy",
"cpus",
"deqs",
"getrandom",
"nanos",
"nocapture",
"rustfmt",
Expand Down
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ lazy_static = "1.4"
num_cpus = "1.13"
parking_lot = "0.10"
scheduled-thread-pool = "0.2"

[dev-dependencies]
getrandom = "0.2"
4 changes: 2 additions & 2 deletions src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,7 @@ impl<K, V, S> Drop for Housekeeper<K, V, S> {
}
}

// functions/methods used by LFUCache
// functions/methods used by Cache
impl<K, V, S> Housekeeper<K, V, S>
where
K: Clone + Eq + Hash,
Expand Down Expand Up @@ -801,7 +801,7 @@ where
///
/// This struct exists with the sole purpose of avoiding compile
/// errors relevant to the thread pool usages. The thread pool
/// requires that the generic parameters on the `LFUCache` and `Inner`
/// requires that the generic parameters on the `Cache` and `Inner`
/// structs to have trait bounds `Send`, `Sync` and `'static`. This
/// will be unacceptable for many cache usages.
///
Expand Down
290 changes: 290 additions & 0 deletions src/frequency_sketch.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
// License and Copyright Notice:
//
// Some of the code and doc comments in this module were ported or copied from
// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
//
// The original code/comments from Caffeine are licensed under the Apache License,
// Version 2.0 <https://github.com/ben-manes/caffeine/blob/master/LICENSE>
//
// Copyrights of the original code/comments are retained by their contributors.
// For full authorship information, see the version control history of
// https://github.com/ben-manes/caffeine/

// This module maintains a 4-bit CountMinSketch [1] with periodic aging to
// provide the popularity history for the TinyLfu admission policy [2].
// The time and space efficiency of the sketch allows it to cheaply estimate the
// frequency of an entry in a stream of cache access events.
//
// The counter matrix is represented as a single dimensional array holding 16
// counters per slot. A fixed depth of four balances the accuracy and cost,
// resulting in a width of four times the length of the array. To retain an
// accurate estimation the array's length equals the maximum number of entries
// in the cache, increased to the closest power-of-two to exploit more efficient
// bit masking. This configuration results in a confidence of 93.75% and error
// bound of e / width.
//
// The frequency of all entries is aged periodically using a sampling window
// based on the maximum number of entries in the cache. This is referred to as
// the reset operation by TinyLfu and keeps the sketch fresh by dividing all
// counters by two and subtracting based on the number of odd counters
// found. The O(n) cost of aging is amortized, ideal for hardware pre-fetching,
// and uses inexpensive bit manipulations per array location.
//
// [1] An Improved Data Stream Summary: The Count-Min Sketch and its Applications
// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
// [2] TinyLFU: A Highly Efficient Cache Admission Policy
// https://dl.acm.org/citation.cfm?id=3149371

/// A probabilistic multi-set for estimating the popularity of an element within
/// a time window. The maximum frequency of an element is limited to 15 (4-bits)
/// and an aging process periodically halves the popularity of all elements.
///
/// Elements are identified only by a caller-supplied 64-bit hash value; the
/// sketch never sees the elements themselves.
#[allow(dead_code)]
pub(crate) struct FrequencySketch {
    /// Number of recorded increments at which the aging pass (`reset`) runs.
    /// (`sampleSize` in Caffeine.)
    sample_size: usize,
    /// `table.len() - 1`. The table length is a power of two, so masking a
    /// hash with this value yields a valid slot index. (`tableMask` in Caffeine.)
    table_mask: usize,
    /// Counter storage. Each `u64` slot packs sixteen 4-bit counters.
    /// (`table` in Caffeine.)
    table: Vec<u64>,
    /// Number of increments recorded so far, scaled down by every aging pass.
    /// (`size` in Caffeine.)
    size: usize,
}

// A mixture of seeds from FNV-1a, CityHash, and Murmur3. (Taken from Caffeine)
// One seed per depth of the sketch; all four are odd.
#[allow(dead_code)]
static SEED: [u64; 4] = [
    0xc3a5c85c97cb3127,
    0xb492b66fbe98f273,
    0x9ae16a3b2f90404f,
    0xcbf29ce484222325,
];

// Keeps the low three bits of every 4-bit counter. Applied after shifting a
// slot right by one, it drops the bit that leaked in from the neighboring
// counter.
#[allow(dead_code)]
static RESET_MASK: u64 = 0x7777777777777777;

// Selects the lowest bit of every 4-bit counter, i.e. the "is odd" bit.
#[allow(dead_code)]
static ONE_MASK: u64 = 0x1111111111111111;

impl FrequencySketch {
    /// Creates a frequency sketch with the capacity.
    ///
    /// The table length is `cap` (clamped to `i32::MAX >> 1`) rounded up to a
    /// power of two, with a minimum of one slot. The sample size is ten times
    /// the clamped capacity, or 10 when `cap` is zero.
    #[allow(dead_code)]
    pub(crate) fn with_capacity(cap: usize) -> Self {
        let maximum = cap.min((i32::MAX >> 1) as usize);
        let table_size = if maximum == 0 {
            1
        } else {
            maximum.next_power_of_two()
        };
        let table = vec![0; table_size];
        // `table_size` is always at least 1, so this subtraction cannot underflow.
        let table_mask = table_size - 1;
        let sample_size = if cap == 0 {
            10
        } else {
            // Only reachable where `usize` is too narrow to hold the product.
            maximum.checked_mul(10).unwrap_or(i32::MAX as usize)
        };
        Self {
            sample_size,
            table_mask,
            table,
            size: 0,
        }
    }

    /// Takes the hash value of an element, and returns the estimated number of
    /// occurrences of the element, up to the maximum (15).
    ///
    /// The estimate is the minimum of the four counters (one per depth) that
    /// the hash maps to.
    #[allow(dead_code)]
    pub(crate) fn frequency(&self, hash: u64) -> u8 {
        // The two low bits of the hash pick one of four groups of four
        // counters inside a slot; `start` is the first counter of that group.
        let start = ((hash & 3) << 2) as u8;
        let mut frequency = u8::MAX;
        for depth in 0..4 {
            let index = self.index_of(hash, depth);
            // Each counter is 4 bits wide, hence the `<< 2` to get a bit offset.
            let count = ((self.table[index] >> ((start + depth) << 2)) & 0xF) as u8;
            frequency = frequency.min(count);
        }
        frequency
    }

    /// Takes a hash value of an element and increments the popularity of the
    /// element if it does not exceed the maximum (15). The popularity of all
    /// elements will be periodically down sampled when the observed events
    /// exceeds a threshold. This process provides a frequency aging to allow
    /// expired long term entries to fade away.
    #[allow(dead_code)]
    pub(crate) fn increment(&mut self, hash: u64) {
        let start = ((hash & 3) << 2) as u8;
        let mut added = false;
        for depth in 0..4 {
            let index = self.index_of(hash, depth);
            added |= self.increment_at(index, start + depth);
        }

        // Only count the event when at least one counter actually changed.
        if added {
            self.size += 1;
            if self.size >= self.sample_size {
                self.reset();
            }
        }
    }

    /// Takes a table index (each entry has 16 counters) and counter index, and
    /// increments the counter by 1 if it is not already at the maximum value
    /// (15). Returns `true` if incremented.
    fn increment_at(&mut self, table_index: usize, counter_index: u8) -> bool {
        let offset = (counter_index as usize) << 2;
        let mask = 0xF_u64 << offset;
        if self.table[table_index] & mask != mask {
            // The counter is below 15, so adding one cannot carry into the
            // neighboring counter.
            self.table[table_index] += 1u64 << offset;
            true
        } else {
            false
        }
    }

    /// Reduces every counter by half of its original value.
    ///
    /// `size` is halved as well and then reduced by a quarter of the number of
    /// counters that were odd before halving. The subtraction saturates at
    /// zero: on small tables the correction term can exceed the halved size,
    /// which would otherwise panic (overflow checks on) or wrap around to a
    /// huge value and force an aging pass on every following increment.
    fn reset(&mut self) {
        let mut count = 0u32;
        for entry in &mut self.table {
            // Count number of odd numbers.
            count += (*entry & ONE_MASK).count_ones();
            *entry = (*entry >> 1) & RESET_MASK;
        }
        self.size = (self.size >> 1).saturating_sub((count >> 2) as usize);
    }

    /// Returns the table index for the counter at the specified depth.
    ///
    /// `depth` must be less than 4 (it indexes `SEED`).
    fn index_of(&self, hash: u64, depth: u8) -> usize {
        let seed = SEED[depth as usize];
        let mixed = hash.wrapping_add(seed).wrapping_mul(seed);
        // Fold the high half into the low half. This must wrap like the rest
        // of the mixing: a plain `+` overflows whenever the upper 32 bits of
        // `mixed` are all ones.
        let mixed = mixed.wrapping_add(mixed >> 32);
        mixed as usize & self.table_mask
    }
}

// Some test cases were ported from Caffeine at:
// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/test/java/com/github/benmanes/caffeine/cache/FrequencySketchTest.java
//
// To see the debug prints, run test as `cargo test -- --nocapture`
#[cfg(test)]
mod tests {
    use super::FrequencySketch;
    use std::hash::{BuildHasher, Hash, Hasher};

    lazy_static::lazy_static! {
        // A random item shared by the tests, drawn once per test process.
        static ref ITEM: u32 = {
            let mut buf = [0; 4];
            getrandom::getrandom(&mut buf).unwrap();
            // Any byte order will do for a random value; no `unsafe` needed.
            u32::from_ne_bytes(buf)
        };
    }

    // This test was ported from Caffeine.
    #[test]
    fn increment_once() {
        let mut sketch = FrequencySketch::with_capacity(512);
        let hasher = hasher();
        let item_hash = hasher(*ITEM);
        sketch.increment(item_hash);
        assert_eq!(sketch.frequency(item_hash), 1);
    }

    // This test was ported from Caffeine.
    #[test]
    fn increment_max() {
        let mut sketch = FrequencySketch::with_capacity(512);
        let hasher = hasher();
        let item_hash = hasher(*ITEM);
        // More increments than the 4-bit counters can hold.
        for _ in 0..20 {
            sketch.increment(item_hash);
        }
        assert_eq!(sketch.frequency(item_hash), 15);
    }

    // This test was ported from Caffeine.
    #[test]
    fn increment_distinct() {
        let mut sketch = FrequencySketch::with_capacity(512);
        let hasher = hasher();
        sketch.increment(hasher(*ITEM));
        sketch.increment(hasher(ITEM.wrapping_add(1)));
        assert_eq!(sketch.frequency(hasher(*ITEM)), 1);
        assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(1))), 1);
        assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(2))), 0);
    }

    // This test was ported from Caffeine.
    #[test]
    fn index_of_around_zero() {
        let sketch = FrequencySketch::with_capacity(512);
        let mut indexes = std::collections::HashSet::new();
        let hashes = vec![u64::MAX, 0, 1];
        for hash in hashes.iter() {
            for depth in 0..4 {
                indexes.insert(sketch.index_of(*hash, depth));
            }
        }
        // Every (hash, depth) pair is expected to land on its own slot.
        assert_eq!(indexes.len(), 4 * hashes.len())
    }

    // This test was ported from Caffeine.
    #[test]
    fn reset() {
        let mut reset = false;
        let mut sketch = FrequencySketch::with_capacity(64);
        let hasher = hasher();

        for i in 1..(20 * sketch.table.len()) {
            sketch.increment(hasher(i));
            // `size` stops tracking the loop counter once an aging pass ran.
            if sketch.size != i {
                reset = true;
                break;
            }
        }

        assert!(reset);
        assert!(sketch.size <= sketch.sample_size / 2);
    }

    // This test was ported from Caffeine.
    #[test]
    fn heavy_hitters() {
        let mut sketch = FrequencySketch::with_capacity(512);
        let hasher = hasher();

        // Background noise.
        for i in 100..100_000 {
            sketch.increment(hasher(i));
        }

        // Even items 0, 2, 4, 6, 8 are incremented as many times as their value.
        for i in (0..10).step_by(2) {
            for _ in 0..i {
                sketch.increment(hasher(i));
            }
        }

        // A perfect popularity count yields an array [0, 0, 2, 0, 4, 0, 6, 0, 8, 0]
        let popularity = (0..10)
            .map(|i| sketch.frequency(hasher(i)))
            .collect::<Vec<_>>();

        for (i, freq) in popularity.iter().enumerate() {
            match i {
                2 => assert!(freq <= &popularity[4]),
                4 => assert!(freq <= &popularity[6]),
                6 => assert!(freq <= &popularity[8]),
                8 => (),
                _ => assert!(freq <= &popularity[2]),
            }
        }
    }

    /// Returns a closure that hashes a key with a freshly seeded `RandomState`.
    /// The same closure always maps equal keys to equal hashes.
    fn hasher<K: Hash>() -> impl Fn(K) -> u64 {
        let build_hasher = std::collections::hash_map::RandomState::default();
        move |key| {
            let mut hasher = build_hasher.build_hasher();
            key.hash(&mut hasher);
            hasher.finish()
        }
    }
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@

mod cache;
mod deque;
mod frequency_sketch;
mod segment;
mod thread_pool;

Expand Down

0 comments on commit 42fb4c8

Please sign in to comment.