diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml index 227535e33..269a81d23 100644 --- a/crates/engine/Cargo.toml +++ b/crates/engine/Cargo.toml @@ -149,3 +149,7 @@ harness = false [[bench]] name = "reorder_buffer_scaling" harness = false + +[[bench]] +name = "reorder_buffer_cache" +harness = false diff --git a/crates/engine/benches/reorder_buffer_cache.rs b/crates/engine/benches/reorder_buffer_cache.rs new file mode 100644 index 000000000..135255e67 --- /dev/null +++ b/crates/engine/benches/reorder_buffer_cache.rs @@ -0,0 +1,273 @@ +//! Cache-behavior benchmark for the `concurrent_delta::ReorderBuffer`. +//! +//! Purpose: provide evidence on whether the current `ReorderBuffer` +//! storage layout (`Box<[Option]>` ring buffer) suffers cache-unfriendly +//! access patterns at 1M parallel results. The actual `perf stat` / +//! `cachegrind` runs are operator-driven and not part of CI. +//! +//! What is measured: +//! - Insert 1M out-of-order indexed items, then drain them in order. +//! - Item payload size is varied across {32 B, 256 B, 4 KB} so the +//! slot footprint sweeps from "comfortably L1/L2-resident" all the +//! way past the LLC for the 1M-element working set. +//! - Insertion order patterns: {fully reverse, random shuffle, +//! near-in-order with 10% deltas}, mirroring the realistic mix of +//! completion orders that rayon workers produce. +//! +//! All payloads are pre-allocated outside the timed section so the +//! benchmark measures `ReorderBuffer` operations, not allocator work. +//! +//! Throughput is reported via `Throughput::Elements(1_000_000)` so +//! `cargo bench` output prints ops/sec for each (payload, pattern) +//! cell. +//! +//! # Running +//! +//! The 1M cells are expensive (5-30 s per iteration), so the bench is +//! gated behind the `BENCH_REORDER_CACHE=1` environment variable. The +//! default `cargo bench` run is a no-op: +//! +//! ```sh +//! # default run: skips entirely +//! cargo bench -p engine --bench reorder_buffer_cache +//! +//! # full 1M cache-behavior sweep (opt-in) +//! BENCH_REORDER_CACHE=1 cargo bench -p engine --bench reorder_buffer_cache +//! ``` +//! +//! ## perf stat (Linux) +//! +//! Use `perf stat` to capture cache-miss/cache-reference counters +//! around the timed section. Build the bench binary, then drive it +//! directly with `--profile-time` so the wall-clock window matches +//! perf's sampling window: +//! +//! ```sh +//! BENCH_REORDER_CACHE=1 cargo bench -p engine --bench reorder_buffer_cache --no-run +//! BIN=$(ls -t target/release/deps/reorder_buffer_cache-* | head -n1) +//! perf stat -e cache-misses,cache-references,L1-dcache-load-misses,LLC-load-misses \ +//! -- BENCH_REORDER_CACHE=1 "$BIN" --bench --profile-time 10 +//! ``` +//! +//! ## cachegrind (any platform via valgrind) +//! +//! Cachegrind is platform-agnostic but ~50x slower than native; expect +//! the 1M cells to take minutes. Drive a single cell at a time using +//! Criterion's filter: +//! +//! ```sh +//! BENCH_REORDER_CACHE=1 cargo bench -p engine --bench reorder_buffer_cache --no-run +//! BIN=$(ls -t target/release/deps/reorder_buffer_cache-* | head -n1) +//! valgrind --tool=cachegrind \ +//! BENCH_REORDER_CACHE=1 "$BIN" --bench reorder_cache_1m/payload256/shuffle +//! cg_annotate cachegrind.out. +//! ``` +//! +//! # How to interpret the numbers +//! +//! The bench provides evidence for one of two design directions: +//! +//! - **Favorable (cache-friendly): high L1 hit rate, low LLC miss rate, +//! throughput stays flat across patterns.** The ring buffer's slot +//! layout works at scale. No layout change is justified - the +//! existing `Box<[Option]>` stays. Optimisation effort should move +//! to other hot spots in the consumer (e.g., spill). +//! - **Unfavorable (cache-hostile): cache-miss rate scales with payload +//! size, random shuffle is much slower than near-in-order, LLC misses +//! dominate insert/drain.** That justifies a layout change. The two +//! credible options: +//! 1. Replace the `Box<[Option]>` with a flat `Vec` plus a +//! bitmap of occupancy. Removes the per-slot enum tag/padding +//! and packs more items into a cache line. +//! 2. Split storage into hot (sequence-index) and cold (payload) +//! arenas so drain-time scans only touch the index. +//! +//! For a 1M-file transfer, even a 2-3x reduction in LLC traffic should +//! be visible end-to-end via the existing throughput benches. + +#![deny(unsafe_code)] + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use engine::concurrent_delta::ReorderBuffer; +use std::hint::black_box; + +const ITEM_COUNT: usize = 1_000_000; +const PAYLOAD_SIZES: &[usize] = &[32, 256, 4096]; + +/// Pattern describing how worker completion order maps to insertion +/// order into the reorder buffer. +#[derive(Copy, Clone)] +enum Pattern { + /// Fully reverse: worker N-1 finishes first, worker 0 finishes last. + /// Maximises the time items spend buffered and the working-set size. + Reverse, + /// Random shuffle over the entire range. Approximates a worst-case + /// access pattern where no temporal locality exists. + Shuffle, + /// Near-in-order: most items arrive close to `next_expected`, with + /// 10% reordered into local windows of up to 32 slots. Closest to + /// the production rayon worker pattern. + NearInOrder, +} + +impl Pattern { + const fn tag(self) -> &'static str { + match self { + Pattern::Reverse => "reverse", + Pattern::Shuffle => "shuffle", + Pattern::NearInOrder => "near_in_order", + } + } +} + +/// Builds the insertion order for the given pattern. +/// +/// Returned `Vec` contains a permutation of `0..count` describing +/// the order in which items are handed to the reorder buffer. +fn build_order(count: usize, pattern: Pattern) -> Vec { + match pattern { + Pattern::Reverse => (0..count as u64).rev().collect(), + Pattern::Shuffle => { + // Fisher-Yates with a deterministic LCG so runs are + // reproducible across machines. + let mut seq: Vec = (0..count as u64).collect(); + let mut state: u64 = 0x5DEE_CE66_D00B_5EE5; + for i in (1..seq.len()).rev() { + state = state + .wrapping_mul(6_364_136_223_846_793_005) + .wrapping_add(1_442_695_040_888_963_407); + let j = ((state >> 33) as usize) % (i + 1); + seq.swap(i, j); + } + seq + } + Pattern::NearInOrder => { + // Start in-order, then reorder 10% of indices within a + // 32-slot local window. Matches the typical rayon + // completion drift. + let mut seq: Vec = (0..count as u64).collect(); + let mut state: u64 = 0x00C0_FFEE_DEAD_BEEF; + let window = 32usize; + let target = count / 10; + for _ in 0..target { + state = state + .wrapping_mul(6_364_136_223_846_793_005) + .wrapping_add(1_442_695_040_888_963_407); + let i = ((state >> 33) as usize) % count; + state = state + .wrapping_mul(6_364_136_223_846_793_005) + .wrapping_add(1_442_695_040_888_963_407); + let span = ((state >> 33) as usize) % window; + let j = (i + span).min(count - 1); + seq.swap(i, j); + } + seq + } + } +} + +/// Pre-allocates `count` payloads of `payload_size` bytes. Each +/// payload's first 8 bytes encode its sequence number so the drain +/// step can verify ordering cheaply. +fn build_payloads(count: usize, payload_size: usize) -> Vec> { + (0..count) + .map(|i| { + let mut v = vec![0u8; payload_size]; + let bytes = (i as u64).to_le_bytes(); + let n = bytes.len().min(payload_size); + v[..n].copy_from_slice(&bytes[..n]); + v + }) + .collect() +} + +/// Runs the insert-then-drain workload against a single +/// `ReorderBuffer`. Returns a checksum over the drained payloads so +/// the optimiser cannot elide the work. +/// +/// Pre-condition: `capacity` is sized so every insert in `order` +/// succeeds without growing the ring (see `capacity_for`). Falling +/// back to `force_insert` would distort the cache-behavior numbers +/// because resize allocates a fresh backing buffer. +fn run_workload(order: &[u64], payloads: Vec>, capacity: usize) -> u64 { + let mut buf: ReorderBuffer> = ReorderBuffer::new(capacity); + // Move payloads out by sequence index. Storing them in an Option + // vec keeps the per-iteration take() O(1). + let mut slots: Vec>> = payloads.into_iter().map(Some).collect(); + + let mut checksum: u64 = 0; + for &seq in order { + let payload = slots[seq as usize] + .take() + .expect("each sequence is consumed exactly once"); + buf.insert(seq, payload) + .expect("capacity_for guarantees the insert fits"); + for v in buf.drain_ready() { + checksum = checksum.wrapping_add(v.len() as u64); + } + } + for v in buf.drain_ready() { + checksum = checksum.wrapping_add(v.len() as u64); + } + checksum +} + +/// Capacity sized to cover the worst-case gap each pattern can +/// produce, so the timed section measures the steady-state ring +/// behaviour rather than `grow()` work. +fn capacity_for(pattern: Pattern, count: usize) -> usize { + match pattern { + // Reverse fills the ring before any drain. Cap at count so + // every item fits without `force_insert`. + Pattern::Reverse => count, + // Shuffle's expected max gap is ~count; cap at count to stay + // on the O(1) fast path. + Pattern::Shuffle => count, + // Near-in-order tops out at the 32-slot window, but we leave + // headroom for the LCG occasionally swapping across boundaries. + Pattern::NearInOrder => 4096, + } +} + +fn bench_cell(c: &mut Criterion, payload_size: usize, pattern: Pattern) { + let group_name = format!("reorder_cache_1m/payload{payload_size}/{}", pattern.tag()); + let mut group = c.benchmark_group(&group_name); + group.throughput(Throughput::Elements(ITEM_COUNT as u64)); + // 1M items per iteration is heavy; minimum sample size keeps + // wall-clock manageable while still smoothing per-run jitter. + group.sample_size(10); + + let order = build_order(ITEM_COUNT, pattern); + let capacity = capacity_for(pattern, ITEM_COUNT); + + group.bench_with_input( + BenchmarkId::from_parameter(format!("cap{capacity}")), + &order, + |b, order| { + b.iter_batched( + || build_payloads(ITEM_COUNT, payload_size), + |payloads| black_box(run_workload(black_box(order), payloads, capacity)), + criterion::BatchSize::PerIteration, + ); + }, + ); + + group.finish(); +} + +/// Entry point for the 1M cache-behavior sweep. Skips entirely unless +/// `BENCH_REORDER_CACHE=1` is set so a default `cargo bench` run does +/// not spend minutes on this bench. +fn bench_reorder_cache(c: &mut Criterion) { + if std::env::var("BENCH_REORDER_CACHE").is_err() { + return; + } + for &payload_size in PAYLOAD_SIZES { + for &pattern in &[Pattern::Reverse, Pattern::Shuffle, Pattern::NearInOrder] { + bench_cell(c, payload_size, pattern); + } + } +} + +criterion_group!(benches, bench_reorder_cache); +criterion_main!(benches);