Skip to content

Commit 08ff017

Browse files
[libc] Improve GPU benchmarking (#153512)
This patch improves the GPU benchmarking in the following ways:

* Replace `rand`/`srand` with a deterministic per-thread RNG seeded by `call_index`: reproducible, apples-to-apples libc vs vendor comparisons.
* Fix input generation: sample the unbiased exponent uniformly in `[min_exp, max_exp]`, clamp bounds, and skip `Inf`, `NaN`, `-0.0`, and `+0.0`.
* Fix standard deviation: use an explicit estimator from sums and sums-of-squares (`sqrt(E[x^2] − E[x]^2)`) across samples.
* Fix throughput overhead: subtract a loop-only baseline inside NVPTX/AMDGPU timing backends so `benchmark()` gets cycles-per-call already corrected (no `overhead()` call).
* Adapt existing math benchmarks to the new RNG/timing plumbing (plumb `call_index`, drop `rand/srand`, clean includes).
* Correct inter-thread aggregation: use iteration-weighted pooling to compute the global mean/variance, ensuring statistically sound `Cycles (Mean)` and `Stddev`.
* Remove `Time / Iteration` column from the results table: it reported per-thread convergence time (not per-call latency) and was redundant/misleading next to `Cycles (Mean)`.
* Remove unused `BenchmarkLogger` files: dead code that added maintenance and cognitive overhead without providing functionality.

---

## TODO (before merge)

* [ ] Investigate compiler warnings and address their root causes.
* [x] Review how per-thread results are aggregated into the overall result.

## Follow-ups (future PRs)

* Add support to run throughput benchmarks with uniform (linear) input distributions, alongside the current log2-uniform scheme.
* Review/adjust the configuration and coverage of existing math benchmarks.
* Add more math benchmarks (e.g., `exp`/`expf`, others).
1 parent f34326d commit 08ff017

File tree

10 files changed

+496
-227
lines changed

10 files changed

+496
-227
lines changed

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@ function(add_benchmark benchmark_name)
2222
${BENCHMARK_LINK_LIBRARIES}
2323
DEPENDS
2424
libc.src.stdio.printf
25-
libc.src.stdlib.srand
26-
libc.src.stdlib.rand
2725
${BENCHMARK_DEPENDS}
2826
${BENCHMARK_UNPARSED_ARGUMENTS}
2927
COMPILE_OPTIONS
@@ -51,7 +49,6 @@ add_unittest_framework_library(
5149
libc.src.__support.CPP.string
5250
libc.src.__support.CPP.string_view
5351
libc.src.__support.CPP.type_traits
54-
libc.src.__support.CPP.functional
5552
libc.src.__support.CPP.limits
5653
libc.src.__support.CPP.algorithm
5754
libc.src.__support.CPP.atomic
@@ -64,8 +61,6 @@ add_unittest_framework_library(
6461
libc.src.__support.FPUtil.sqrt
6562
libc.src.__support.fixedvector
6663
libc.src.time.clock
67-
libc.src.stdlib.rand
68-
libc.src.stdlib.srand
6964
libc.benchmarks.gpu.timing.timing
7065
libc.src.stdio.printf
7166
)
Lines changed: 99 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
#include "LibcGpuBenchmark.h"
2+
3+
#include "hdr/stdint_proxy.h"
24
#include "src/__support/CPP/algorithm.h"
35
#include "src/__support/CPP/array.h"
46
#include "src/__support/CPP/atomic.h"
57
#include "src/__support/CPP/string.h"
8+
#include "src/__support/FPUtil/FPBits.h"
69
#include "src/__support/FPUtil/sqrt.h"
710
#include "src/__support/GPU/utils.h"
811
#include "src/__support/fixedvector.h"
912
#include "src/__support/macros/config.h"
1013
#include "src/__support/time/gpu/time_utils.h"
1114
#include "src/stdio/printf.h"
12-
#include "src/stdlib/srand.h"
1315

1416
namespace LIBC_NAMESPACE_DECL {
1517
namespace benchmarks {
@@ -20,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
2022
benchmarks.push_back(benchmark);
2123
}
2224

25+
// Atomically adds `value` to a double whose bit pattern is stored in the
// 64-bit integer atomic `atomic_bits`.
//
// The accumulator is kept as raw uint64_t bits and converted to and from
// double through FPBits on every attempt; the addition itself is performed in
// double precision and published with a compare-and-swap retry loop.
static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
26+
double value) {
27+
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
28+
29+
// Initial snapshot of the stored bit pattern.
uint64_t expected_bits = atomic_bits.load(cpp::MemoryOrder::RELAXED);
30+
31+
while (true) {
32+
// Reinterpret the snapshot as a double and compute the new sum.
double current_value = FPBits(expected_bits).get_val();
33+
double next_value = current_value + value;
34+
35+
uint64_t desired_bits = FPBits(next_value).uintval();
// Publish the sum only if the stored bits still equal the snapshot.
// NOTE(review): the loop never reloads explicitly, so it relies on
// compare_exchange_strong refreshing `expected_bits` on failure (as
// std::atomic does) — confirm against cpp::Atomic.
36+
if (atomic_bits.compare_exchange_strong(expected_bits, desired_bits,
37+
cpp::MemoryOrder::ACQUIRE,
38+
cpp::MemoryOrder::RELAXED))
39+
break;
40+
}
41+
}
42+
2343
struct AtomicBenchmarkSums {
24-
cpp::Atomic<uint64_t> cycles_sum = 0;
25-
cpp::Atomic<uint64_t> standard_deviation_sum = 0;
44+
cpp::Atomic<uint32_t> active_threads = 0;
45+
cpp::Atomic<uint64_t> iterations_sum = 0;
46+
cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
47+
cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
2648
cpp::Atomic<uint64_t> min = UINT64_MAX;
2749
cpp::Atomic<uint64_t> max = 0;
28-
cpp::Atomic<uint32_t> samples_sum = 0;
29-
cpp::Atomic<uint32_t> iterations_sum = 0;
30-
cpp::Atomic<clock_t> time_sum = 0;
31-
cpp::Atomic<uint64_t> active_threads = 0;
3250

3351
void reset() {
3452
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
3553
active_threads.store(0, cpp::MemoryOrder::RELAXED);
36-
cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
37-
standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
54+
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
55+
weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
56+
weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
3857
min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
3958
max.store(0, cpp::MemoryOrder::RELAXED);
40-
samples_sum.store(0, cpp::MemoryOrder::RELAXED);
41-
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
42-
time_sum.store(0, cpp::MemoryOrder::RELAXED);
4359
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
4460
}
4561

4662
void update(const BenchmarkResult &result) {
4763
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
4864
active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
65+
iterations_sum.fetch_add(result.total_iterations,
66+
cpp::MemoryOrder::RELAXED);
4967

50-
cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
51-
standard_deviation_sum.fetch_add(
52-
static_cast<uint64_t>(result.standard_deviation),
53-
cpp::MemoryOrder::RELAXED);
68+
const double n_i = static_cast<double>(result.total_iterations);
69+
const double mean_i = result.cycles;
70+
const double stddev_i = result.standard_deviation;
71+
const double variance_i = stddev_i * stddev_i;
72+
atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i);
73+
atomic_add_double(weighted_squared_cycles_sum_bits,
74+
n_i * (variance_i + mean_i * mean_i));
5475

5576
// Perform a CAS loop to atomically update the min
5677
uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
@@ -66,10 +87,6 @@ struct AtomicBenchmarkSums {
6687
cpp::MemoryOrder::RELAXED))
6788
;
6889

69-
samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
70-
iterations_sum.fetch_add(result.total_iterations,
71-
cpp::MemoryOrder::RELAXED);
72-
time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
7390
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
7491
}
7592
};
@@ -79,56 +96,58 @@ constexpr auto GREEN = "\033[32m";
7996
constexpr auto RESET = "\033[0m";
8097

8198
void print_results(Benchmark *b) {
82-
BenchmarkResult result;
99+
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
100+
101+
BenchmarkResult final_result;
83102
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
84-
int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
85-
result.cycles =
86-
all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
87-
result.standard_deviation =
88-
all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
89-
num_threads;
90-
result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
91-
result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
92-
result.samples =
93-
all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
94-
result.total_iterations =
95-
all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
96-
const uint64_t duration_ns =
97-
all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
98-
const uint64_t duration_us = duration_ns / 1000;
99-
const uint64_t duration_ms = duration_ns / (1000 * 1000);
100-
uint64_t converted_duration = duration_ns;
101-
const char *time_unit;
102-
if (duration_ms != 0) {
103-
converted_duration = duration_ms;
104-
time_unit = "ms";
105-
} else if (duration_us != 0) {
106-
converted_duration = duration_us;
107-
time_unit = "us";
103+
104+
const uint32_t num_threads =
105+
all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
106+
final_result.total_iterations =
107+
all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED);
108+
109+
if (final_result.total_iterations > 0) {
110+
const uint64_t s1_bits =
111+
all_results.weighted_cycles_sum_bits.load(cpp::MemoryOrder::RELAXED);
112+
const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits.load(
113+
cpp::MemoryOrder::RELAXED);
114+
115+
const double S1 = FPBits(s1_bits).get_val();
116+
const double S2 = FPBits(s2_bits).get_val();
117+
const double N = static_cast<double>(final_result.total_iterations);
118+
119+
const double global_mean = S1 / N;
120+
const double global_mean_of_squares = S2 / N;
121+
const double global_variance =
122+
global_mean_of_squares - (global_mean * global_mean);
123+
124+
final_result.cycles = global_mean;
125+
final_result.standard_deviation =
126+
fputil::sqrt<double>(global_variance < 0.0 ? 0.0 : global_variance);
108127
} else {
109-
converted_duration = duration_ns;
110-
time_unit = "ns";
128+
final_result.cycles = 0.0;
129+
final_result.standard_deviation = 0.0;
111130
}
112-
result.total_time = converted_duration;
113-
// result.total_time =
114-
// all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
131+
132+
final_result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
133+
final_result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
115134
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
116135

117136
LIBC_NAMESPACE::printf(
118-
"%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
119-
b->get_test_name().data(), result.cycles, result.min, result.max,
120-
result.total_iterations, result.total_time, time_unit,
121-
static_cast<uint64_t>(result.standard_deviation), num_threads);
137+
"%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
138+
b->get_test_name().data(), final_result.cycles,
139+
final_result.standard_deviation, (unsigned long long)final_result.min,
140+
(unsigned long long)final_result.max,
141+
(unsigned long long)final_result.total_iterations, (unsigned)num_threads);
122142
}
123143

124144
void print_header() {
125145
LIBC_NAMESPACE::printf("%s", GREEN);
126146
LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
127147
benchmarks[0]->get_suite_name().data());
128148
LIBC_NAMESPACE::printf("%s", RESET);
129-
cpp::string titles =
130-
"Benchmark | Cycles | Min | Max | "
131-
"Iterations | Time / Iteration | Stddev | Threads |\n";
149+
cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | "
150+
" Min | Max | Iterations | Threads |\n";
132151
LIBC_NAMESPACE::printf(titles.data());
133152

134153
cpp::string separator(titles.size(), '-');
@@ -139,10 +158,8 @@ void print_header() {
139158
void Benchmark::run_benchmarks() {
140159
uint64_t id = gpu::get_thread_id();
141160

142-
if (id == 0) {
161+
if (id == 0)
143162
print_header();
144-
LIBC_NAMESPACE::srand(gpu::processor_clock());
145-
}
146163

147164
gpu::sync_threads();
148165

@@ -164,69 +181,63 @@ void Benchmark::run_benchmarks() {
164181
}
165182

166183
BenchmarkResult benchmark(const BenchmarkOptions &options,
167-
cpp::function<uint64_t(void)> wrapper_func) {
184+
const BenchmarkTarget &target) {
168185
BenchmarkResult result;
169186
RuntimeEstimationProgression rep;
170-
uint32_t total_iterations = 0;
171187
uint32_t iterations = options.initial_iterations;
188+
172189
if (iterations < 1u)
173190
iterations = 1;
174191

175192
uint32_t samples = 0;
176193
uint64_t total_time = 0;
177-
uint64_t best_guess = 0;
178-
uint64_t cycles_squared = 0;
179194
uint64_t min = UINT64_MAX;
180195
uint64_t max = 0;
181196

182-
uint64_t overhead = UINT64_MAX;
183-
int overhead_iterations = 10;
184-
for (int i = 0; i < overhead_iterations; i++)
185-
overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
197+
uint32_t call_index = 0;
186198

187199
for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
188-
uint64_t sample_cycles = 0;
189-
const clock_t start = static_cast<double>(clock());
190-
for (uint32_t i = 0; i < iterations; i++) {
191-
auto wrapper_intermediate = wrapper_func();
192-
uint64_t current_result = wrapper_intermediate - overhead;
200+
RefinableRuntimeEstimator sample_estimator;
201+
202+
const clock_t start = clock();
203+
while (sample_estimator.get_iterations() < iterations) {
204+
auto current_result = target(call_index++);
193205
max = cpp::max(max, current_result);
194206
min = cpp::min(min, current_result);
195-
sample_cycles += current_result;
207+
sample_estimator.update(current_result);
196208
}
197209
const clock_t end = clock();
210+
198211
const clock_t duration_ns =
199212
((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
200213
total_time += duration_ns;
201214
time_budget -= duration_ns;
202215
samples++;
203-
cycles_squared += sample_cycles * sample_cycles;
204216

205-
total_iterations += iterations;
206-
const double change_ratio =
207-
rep.compute_improvement({iterations, sample_cycles});
208-
best_guess = rep.current_estimation;
217+
const double change_ratio = rep.compute_improvement(sample_estimator);
209218

210219
if (samples >= options.max_samples || iterations >= options.max_iterations)
211220
break;
221+
222+
const auto total_iterations = rep.get_estimator().get_iterations();
223+
212224
if (total_time >= options.min_duration && samples >= options.min_samples &&
213225
total_iterations >= options.min_iterations &&
214226
change_ratio < options.epsilon)
215227
break;
216228

217-
iterations *= options.scaling_factor;
229+
iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
218230
}
219-
result.cycles = best_guess;
220-
result.standard_deviation = fputil::sqrt<double>(
221-
static_cast<double>(cycles_squared) / total_iterations -
222-
static_cast<double>(best_guess * best_guess));
231+
232+
const auto &estimator = rep.get_estimator();
233+
result.total_iterations = estimator.get_iterations();
234+
result.cycles = estimator.get_mean();
235+
result.standard_deviation = estimator.get_stddev();
223236
result.min = min;
224237
result.max = max;
225-
result.samples = samples;
226-
result.total_iterations = total_iterations;
227-
result.total_time = total_time / total_iterations;
238+
228239
return result;
229-
};
240+
}
230241

231242
} // namespace benchmarks
232243
} // namespace LIBC_NAMESPACE_DECL

0 commit comments

Comments
 (0)