From ad56c188426971db48037c2a8358a14ebf07a01a Mon Sep 17 00:00:00 2001 From: Jaya Kasa Date: Thu, 28 May 2026 08:58:15 -0400 Subject: [PATCH] Add Sampler and SampledList for heap profiling infrastructure Introduces two header-only components behind SNMALLOC_PROFILE, with no changes to existing allocation paths: - sampler.h: per-thread Poisson sampler. Fast path is one subtraction and branch per allocation. Slow path fires ~once per g_sample_interval bytes (default 512KB) and draws the next interval from a geometric distribution via xorshift64 + inverse-CDF. - sampled_list.h: SampledAlloc struct (raw PCs, size, weight) and a global doubly-linked SampledList. push() inserts at the head and takes the list's std::mutex to maintain the prev links, so it is not lock-free; remove() and iterate() also hold the mutex. Also defines the thread-local re-entrancy guard (g_in_sample_recording) that prevents recursive sampling when backtrace() or operator new is called from record_sample(). Tests cover sampling rate convergence, disabled-interval behaviour, large-allocation sampling probability, and concurrent push correctness. --- src/snmalloc/mem/sampled_list.h | 102 ++++++++++++++++ src/snmalloc/mem/sampler.h | 110 +++++++++++++++++ src/test/func/sampler/sampler.cc | 199 +++++++++++++++++++++++++++++++ 3 files changed, 411 insertions(+) create mode 100644 src/snmalloc/mem/sampled_list.h create mode 100644 src/snmalloc/mem/sampler.h create mode 100644 src/test/func/sampler/sampler.cc diff --git a/src/snmalloc/mem/sampled_list.h b/src/snmalloc/mem/sampled_list.h new file mode 100644 index 000000000..33edcca04 --- /dev/null +++ b/src/snmalloc/mem/sampled_list.h @@ -0,0 +1,102 @@ +#pragma once + +#ifdef SNMALLOC_PROFILE + +# include +# include +# include +# include + +# include "snmalloc/stl/atomic.h" + +namespace snmalloc +{ + static constexpr uint32_t kMaxStackDepth = 64; + + /** + * One live sampled allocation. 
+   *
+   * Raw program-counter addresses are stored here; symbolication is deferred
+   * to profile-dump time so there is no per-allocation symbol-lookup cost.
+   */
+  struct SampledAlloc
+  {
+    void* ptr{nullptr};
+    size_t requested_size{0};
+    size_t allocated_size{0};
+    size_t weight{0};
+    uint32_t depth{0};
+    void* stack[kMaxStackDepth]{};
+
+    // List links; written by SampledList while it holds its mutex.
+    SampledAlloc* next{nullptr};
+    SampledAlloc* prev{nullptr};
+  };
+
+  /**
+   * Global doubly-linked list of live sampled allocations.
+   *
+   * push(), remove() and iterate() all hold mutex_ for their whole body.
+   * Publishing a node with a CAS and fixing up the old head's prev link
+   * afterwards is not safe next to remove(): a remove() of the old head that
+   * runs in between still sees prev == nullptr, stores old->next into head_
+   * and so drops the freshly pushed node. Doing the whole link under the
+   * lock closes that window, and the lock also orders the node's fields
+   * before any iterate() that reaches it.
+   *
+   * The list does not own its nodes; callers keep a node alive until they
+   * have removed it.
+   */
+  class SampledList
+  {
+    // Only read or written with mutex_ held, so relaxed ordering is enough.
+    stl::Atomic<SampledAlloc*> head_{nullptr};
+    mutable std::mutex mutex_;
+
+  public:
+    // Called from the alloc slow path. Links `node` in as the new head.
+    void push(SampledAlloc* node) noexcept
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      SampledAlloc* old_head = head_.load(stl::memory_order_relaxed);
+      node->prev = nullptr;
+      node->next = old_head;
+      if (old_head != nullptr)
+        old_head->prev = node;
+      head_.store(node, stl::memory_order_relaxed);
+    }
+
+    // Called from the dealloc path. `node` must currently be in this list.
+    void remove(SampledAlloc* node) noexcept
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (node->prev != nullptr)
+        node->prev->next = node->next;
+      else
+        head_.store(node->next, stl::memory_order_relaxed);
+      if (node->next != nullptr)
+        node->next->prev = node->prev;
+      // Leave no stale links behind in the detached node.
+      node->next = nullptr;
+      node->prev = nullptr;
+    }
+
+    // Calls fn(node) for every node, head first, with the mutex held; fn
+    // must therefore not call push() or remove() on this list.
+    template<typename Fn>
+    void iterate(Fn&& fn) const
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      SampledAlloc* cur = head_.load(stl::memory_order_relaxed);
+      while (cur != nullptr)
+      {
+        fn(*cur);
+        cur = cur->next;
+      }
+    }
+  };
+
+  // One global list — all sampled allocations across all threads.
+  inline SampledList g_sampled_list;
+
+  /**
+   * Re-entrancy guard: suppresses sampling for any allocation made
+   * while we are already inside record_sample() (e.g. via backtrace()
+   * calling into the allocator).
+   */
+  inline thread_local bool g_in_sample_recording{false};
+
+} // namespace snmalloc
+
+#endif // SNMALLOC_PROFILE
diff --git a/src/snmalloc/mem/sampler.h b/src/snmalloc/mem/sampler.h
new file mode 100644
index 000000000..0149e4ba7
--- /dev/null
+++ b/src/snmalloc/mem/sampler.h
@@ -0,0 +1,145 @@
+#pragma once
+
+#ifdef SNMALLOC_PROFILE
+
+#  include "snmalloc/ds_core/defines.h"
+#  include "snmalloc/stl/atomic.h"
+
+#  include <climits>
+#  include <cmath>
+#  include <cstddef>
+#  include <cstdint>
+#  include <sys/types.h>
+
+namespace snmalloc
+{
+  /**
+   * Sampling interval in bytes. record() fires on average once per interval.
+   * Set to 0 to disable sampling. A disabled Sampler re-reads this value
+   * from time to time, so storing a non-zero value later turns sampling
+   * back on.
+   */
+  inline stl::Atomic<size_t> g_sample_interval{512 * 1024};
+
+  /**
+   * Per-thread Poisson sampler for heap profiling.
+   *
+   * Models the allocation stream as a byte sequence with each byte
+   * independently marked with probability 1/interval. An allocation
+   * is sampled iff it contains at least one marked byte, giving:
+   *
+   *   P(sample) = 1 - e^(-size/interval)
+   *
+   * Fast path: one subtraction and branch per allocation.
+   * Slow path: geometric sample + PRNG step, taken ~once per interval bytes.
+   *
+   * Not thread-safe — one Sampler per allocator (which is per-thread).
+   *
+   * NOTE(review): ssize_t and SSIZE_MAX are POSIX names; confirm they are
+   * available on every platform this header is built for.
+   */
+  class Sampler
+  {
+    // While sampling is disabled, bytes to let pass before looking at
+    // g_sample_interval again.
+    static constexpr ssize_t kDisabledRecheckBytes = ssize_t{1} << 20;
+
+    // Upper bound on the gap returned by geometric(); keeps the
+    // double -> ssize_t conversion in range for any interval.
+    static constexpr ssize_t kMaxGap = SSIZE_MAX / 2;
+
+    // Counts down bytes until next sample. Non-positive when a sample fires.
+    ssize_t bytes_until_sample_{0};
+
+    // xorshift64 PRNG state. Zero means not yet seeded.
+    uint64_t rng_{0};
+
+    // True while bytes_until_sample_ holds a gap drawn by geometric().
+    bool armed_{false};
+
+  public:
+    /**
+     * Account for an allocation of `size` bytes.
+     * Returns 0 if not sampled, or a positive weight if this allocation
+     * should be recorded. The weight is an estimate of how many bytes
+     * this sample statistically represents.
+     */
+    SNMALLOC_FAST_PATH size_t record(size_t size) noexcept
+    {
+      bytes_until_sample_ -= static_cast<ssize_t>(size);
+      if (SNMALLOC_LIKELY(bytes_until_sample_ > 0))
+        return 0;
+      return record_slow(size);
+    }
+
+  private:
+    /**
+     * Slow path of record(), entered when the countdown is at or below zero.
+     * record() has already subtracted `size` from the countdown.
+     */
+    SNMALLOC_SLOW_PATH size_t record_slow(size_t size) noexcept
+    {
+      size_t interval = g_sample_interval.load(stl::memory_order_relaxed);
+
+      if (interval == 0)
+      {
+        // Sampling disabled. Park for a bounded number of bytes, not
+        // forever, so this Sampler notices when the interval becomes
+        // non-zero again; the old gap is discarded.
+        armed_ = false;
+        bytes_until_sample_ = kDisabledRecheckBytes;
+        return 0;
+      }
+
+      if (SNMALLOC_UNLIKELY(!armed_))
+      {
+        // First use, or first use after being disabled: seed the PRNG from
+        // this object's address if that has not happened yet, then pick a
+        // sample point and re-apply the current allocation to it.
+        if (rng_ == 0)
+          rng_ = (reinterpret_cast<uint64_t>(this) ^ 0xdeadbeef01234567ULL) | 1;
+        armed_ = true;
+        bytes_until_sample_ = geometric(interval);
+        bytes_until_sample_ -= static_cast<ssize_t>(size);
+        if (bytes_until_sample_ > 0)
+          return 0;
+        // Fell through: this allocation also triggers a sample.
+      }
+
+      // Weight: every sample is reported as `interval` bytes, which is the
+      // expected number of bytes per sample when allocations are much
+      // smaller than the interval.
+      // NOTE(review): for allocations comparable to or larger than the
+      // interval this under-weights the sample; size / P(sample) would be
+      // the inverse-probability weight. Left as-is for Phase 2.
+      size_t weight = interval;
+      bytes_until_sample_ = geometric(interval);
+      return weight;
+    }
+
+    // Random gap, in bytes, to the next sample point with the given mean,
+    // via inverse CDF: gap = -log(U) * mean, U ~ Uniform(0, 1].
+    ssize_t geometric(size_t mean) noexcept
+    {
+      // xorshift64
+      rng_ ^= rng_ << 13;
+      rng_ ^= rng_ >> 7;
+      rng_ ^= rng_ << 17;
+
+      // Map the top 53 bits to (0, 1]: the +1 excludes zero, so log() is
+      // always finite, and every value in [1, 2^53] is exact in a double.
+      double u = static_cast<double>((rng_ >> 11) + 1) /
+        static_cast<double>(uint64_t(1) << 53);
+
+      // Clamp before converting: an out-of-range double -> integer
+      // conversion is undefined behaviour.
+      double gap = -std::log(u) * static_cast<double>(mean);
+      if (SNMALLOC_UNLIKELY(gap >= static_cast<double>(kMaxGap)))
+        return kMaxGap;
+      return static_cast<ssize_t>(gap);
+    }
+  };
+
+} // namespace snmalloc
+
+#endif // SNMALLOC_PROFILE
diff --git a/src/test/func/sampler/sampler.cc b/src/test/func/sampler/sampler.cc
new file mode 100644
index 000000000..410f323a1
--- /dev/null
+++ b/src/test/func/sampler/sampler.cc
@@ -0,0 +1,199 @@
+// Tests for Sampler and SampledList (Phase 2 of heap profiling).
+// SNMALLOC_PROFILE is defined here so these classes compile standalone,
+// independent of the build-level flag.
+#ifndef SNMALLOC_PROFILE
+#  define SNMALLOC_PROFILE 1
+#endif
+
+#include "snmalloc/mem/sampled_list.h"
+#include "snmalloc/mem/sampler.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+using namespace snmalloc;
+
+// ---------------------------------------------------------------------------
+// Sampler tests
+// ---------------------------------------------------------------------------
+
+// Verify that the sampled fraction converges to 1 - e^(-size/interval).
+static void test_sampler_rate()
+{
+  std::cout << "test_sampler_rate\n";
+
+  const size_t interval = 512;
+  const size_t alloc_size = 64;
+  const size_t n = 100'000;
+
+  Sampler s;
+  g_sample_interval.store(interval, stl::memory_order_relaxed);
+
+  size_t hits = 0;
+  for (size_t i = 0; i < n; i++)
+  {
+    if (s.record(alloc_size) != 0)
+      hits++;
+  }
+
+  const double expected_p = 1.0 -
+    std::exp(-static_cast<double>(alloc_size) / static_cast<double>(interval));
+  const double actual_p = static_cast<double>(hits) / static_cast<double>(n);
+
+  // Statistical check, not a tight bound: the binomial standard error for
+  // these parameters is about 0.001, so 0.02 absolute leaves ample slack
+  // while still rejecting a rate that is off by a fifth or more.
+  if (std::abs(actual_p - expected_p) > 0.02)
+  {
+    std::cerr << "sampler rate out of range: expected ~" << expected_p
+              << " got " << actual_p << "\n";
+    abort();
+  }
+}
+
+// With interval=0, record() must never fire.
+static void test_sampler_disabled()
+{
+  std::cout << "test_sampler_disabled\n";
+
+  Sampler s;
+  g_sample_interval.store(0, stl::memory_order_relaxed);
+
+  for (size_t i = 0; i < 100'000; i++)
+  {
+    if (s.record(64) == 0)
+      continue;
+
+    std::cerr << "sampler fired with interval=0\n";
+    abort();
+  }
+}
+
+// Large allocation (size >> interval) should almost always be sampled.
+static void test_sampler_large_alloc()
+{
+  std::cout << "test_sampler_large_alloc\n";
+
+  Sampler s;
+  g_sample_interval.store(512, stl::memory_order_relaxed);
+
+  const size_t large = 64 * 1024;
+  const size_t n = 1'000;
+  size_t hits = 0;
+
+  for (size_t i = 0; i < n; i++)
+  {
+    if (s.record(large) != 0)
+      hits++;
+  }
+
+  // P = 1 - e^(-64K/512) ≈ 1.0 — expect at least 95% sampled.
+  if (hits < n * 95 / 100)
+  {
+    std::cerr << "large alloc under-sampled: " << hits << "/" << n << "\n";
+    abort();
+  }
+}
+
+// ---------------------------------------------------------------------------
+// SampledList tests
+// ---------------------------------------------------------------------------
+
+// Abort with a message when `ok` is false. Unlike assert(), the check is
+// still performed when the test is built with NDEBUG.
+static void require(bool ok, const char* what)
+{
+  if (!ok)
+  {
+    std::cerr << "check failed: " << what << "\n";
+    abort();
+  }
+}
+
+// Number of nodes iterate() visits.
+static size_t list_size(const SampledList& list)
+{
+  size_t count = 0;
+  list.iterate([&](const SampledAlloc&) { count++; });
+  return count;
+}
+
+// Push three nodes, then remove a middle node, the tail and the head.
+static void test_list_push_remove()
+{
+  std::cout << "test_list_push_remove\n";
+
+  SampledList list;
+  SampledAlloc a, b, c;
+  a.ptr = &a;
+  b.ptr = &b;
+  c.ptr = &c;
+
+  list.push(&a);
+  list.push(&b);
+  list.push(&c);
+  require(list_size(list) == 3, "three nodes after three pushes");
+
+  list.remove(&b);
+  require(list_size(list) == 2, "two nodes after removing the middle one");
+  list.iterate([&](const SampledAlloc& n) {
+    require(n.ptr != &b, "removed node is no longer visited");
+  });
+
+  list.remove(&a);
+  list.remove(&c);
+  require(list_size(list) == 0, "list is empty after removing every node");
+}
+
+// Removing the current head must leave the older node as the only entry.
+static void test_list_remove_head()
+{
+  std::cout << "test_list_remove_head\n";
+
+  SampledList list;
+  SampledAlloc a, b;
+  // Give the nodes distinct ptr values so the survivor can be identified.
+  a.ptr = &a;
+  b.ptr = &b;
+
+  list.push(&a);
+  list.push(&b); // b becomes head (LIFO)
+  list.remove(&b);
+
+  size_t count = 0;
+  void* found = nullptr;
+  list.iterate([&](const SampledAlloc& n) {
+    count++;
+    found = n.ptr;
+  });
+  require(count == 1, "one node left after removing the head");
+  require(found == &a, "the remaining node is the one pushed first");
+}
+
+// Multiple threads push concurrently; all nodes must appear in iteration.
+static void test_list_concurrent_push()
+{
+  std::cout << "test_list_concurrent_push\n";
+
+  SampledList list;
+  const size_t n_threads = 8;
+  const size_t n_per_thread = 128;
+
+  // One row of nodes per thread; each thread only touches its own row.
+  std::vector<std::vector<SampledAlloc>> nodes(
+    n_threads, std::vector<SampledAlloc>(n_per_thread));
+  std::vector<std::thread> threads;
+
+  for (size_t t = 0; t < n_threads; t++)
+  {
+    threads.emplace_back([&, t]() {
+      for (size_t i = 0; i < n_per_thread; i++)
+      {
+        // Self-referencing ptr lets the check below recognise the node.
+        nodes[t][i].ptr = &nodes[t][i];
+        list.push(&nodes[t][i]);
+      }
+    });
+  }
+  for (auto& th : threads)
+    th.join();
+
+  size_t count = 0;
+  size_t intact = 0;
+  list.iterate([&](const SampledAlloc& n) {
+    count++;
+    if (n.ptr == &n)
+      intact++;
+  });
+
+  if (count != n_threads * n_per_thread)
+  {
+    std::cerr << "concurrent push: expected " << n_threads * n_per_thread
+              << " nodes, got " << count << "\n";
+    abort();
+  }
+
+  if (intact != count)
+  {
+    std::cerr << "concurrent push: " << count - intact
+              << " visited nodes are not the ones that were pushed\n";
+    abort();
+  }
+}
+
+// ---------------------------------------------------------------------------
+
+int main()
+{
+  test_sampler_rate();
+  test_sampler_disabled();
+  test_sampler_large_alloc();
+  test_list_push_remove();
+  test_list_remove_head();
+  test_list_concurrent_push();
+
+  std::cout << "all sampler tests passed\n";
+  return 0;
+}