1
1
#include " LibcGpuBenchmark.h"
2
+
3
+ #include " hdr/stdint_proxy.h"
2
4
#include " src/__support/CPP/algorithm.h"
3
5
#include " src/__support/CPP/array.h"
4
6
#include " src/__support/CPP/atomic.h"
5
7
#include " src/__support/CPP/string.h"
8
+ #include " src/__support/FPUtil/FPBits.h"
6
9
#include " src/__support/FPUtil/sqrt.h"
7
10
#include " src/__support/GPU/utils.h"
8
11
#include " src/__support/fixedvector.h"
9
12
#include " src/__support/macros/config.h"
10
13
#include " src/__support/time/gpu/time_utils.h"
11
14
#include " src/stdio/printf.h"
12
- #include " src/stdlib/srand.h"
13
15
14
16
namespace LIBC_NAMESPACE_DECL {
15
17
namespace benchmarks {
@@ -20,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
20
22
benchmarks.push_back (benchmark);
21
23
}
22
24
25
+ static void atomic_add_double (cpp::Atomic<uint64_t > &atomic_bits,
26
+ double value) {
27
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double >;
28
+
29
+ uint64_t expected_bits = atomic_bits.load (cpp::MemoryOrder::RELAXED);
30
+
31
+ while (true ) {
32
+ double current_value = FPBits (expected_bits).get_val ();
33
+ double next_value = current_value + value;
34
+
35
+ uint64_t desired_bits = FPBits (next_value).uintval ();
36
+ if (atomic_bits.compare_exchange_strong (expected_bits, desired_bits,
37
+ cpp::MemoryOrder::ACQUIRE,
38
+ cpp::MemoryOrder::RELAXED))
39
+ break ;
40
+ }
41
+ }
42
+
23
43
struct AtomicBenchmarkSums {
24
- cpp::Atomic<uint64_t > cycles_sum = 0 ;
25
- cpp::Atomic<uint64_t > standard_deviation_sum = 0 ;
44
+ cpp::Atomic<uint32_t > active_threads = 0 ;
45
+ cpp::Atomic<uint64_t > iterations_sum = 0 ;
46
+ cpp::Atomic<uint64_t > weighted_cycles_sum_bits = 0 ;
47
+ cpp::Atomic<uint64_t > weighted_squared_cycles_sum_bits = 0 ;
26
48
cpp::Atomic<uint64_t > min = UINT64_MAX;
27
49
cpp::Atomic<uint64_t > max = 0 ;
28
- cpp::Atomic<uint32_t > samples_sum = 0 ;
29
- cpp::Atomic<uint32_t > iterations_sum = 0 ;
30
- cpp::Atomic<clock_t > time_sum = 0 ;
31
- cpp::Atomic<uint64_t > active_threads = 0 ;
32
50
33
51
void reset () {
34
52
cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
35
53
active_threads.store (0 , cpp::MemoryOrder::RELAXED);
36
- cycles_sum.store (0 , cpp::MemoryOrder::RELAXED);
37
- standard_deviation_sum.store (0 , cpp::MemoryOrder::RELAXED);
54
+ iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
55
+ weighted_cycles_sum_bits.store (0 , cpp::MemoryOrder::RELAXED);
56
+ weighted_squared_cycles_sum_bits.store (0 , cpp::MemoryOrder::RELAXED);
38
57
min.store (UINT64_MAX, cpp::MemoryOrder::RELAXED);
39
58
max.store (0 , cpp::MemoryOrder::RELAXED);
40
- samples_sum.store (0 , cpp::MemoryOrder::RELAXED);
41
- iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
42
- time_sum.store (0 , cpp::MemoryOrder::RELAXED);
43
59
cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
44
60
}
45
61
46
62
void update (const BenchmarkResult &result) {
47
63
cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
48
64
active_threads.fetch_add (1 , cpp::MemoryOrder::RELAXED);
65
+ iterations_sum.fetch_add (result.total_iterations ,
66
+ cpp::MemoryOrder::RELAXED);
49
67
50
- cycles_sum.fetch_add (result.cycles , cpp::MemoryOrder::RELAXED);
51
- standard_deviation_sum.fetch_add (
52
- static_cast <uint64_t >(result.standard_deviation ),
53
- cpp::MemoryOrder::RELAXED);
68
+ const double n_i = static_cast <double >(result.total_iterations );
69
+ const double mean_i = result.cycles ;
70
+ const double stddev_i = result.standard_deviation ;
71
+ const double variance_i = stddev_i * stddev_i;
72
+ atomic_add_double (weighted_cycles_sum_bits, n_i * mean_i);
73
+ atomic_add_double (weighted_squared_cycles_sum_bits,
74
+ n_i * (variance_i + mean_i * mean_i));
54
75
55
76
// Perform a CAS loop to atomically update the min
56
77
uint64_t orig_min = min.load (cpp::MemoryOrder::RELAXED);
@@ -66,10 +87,6 @@ struct AtomicBenchmarkSums {
66
87
cpp::MemoryOrder::RELAXED))
67
88
;
68
89
69
- samples_sum.fetch_add (result.samples , cpp::MemoryOrder::RELAXED);
70
- iterations_sum.fetch_add (result.total_iterations ,
71
- cpp::MemoryOrder::RELAXED);
72
- time_sum.fetch_add (result.total_time , cpp::MemoryOrder::RELAXED);
73
90
cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
74
91
}
75
92
};
@@ -79,56 +96,58 @@ constexpr auto GREEN = "\033[32m";
79
96
constexpr auto RESET = " \033 [0m" ;
80
97
81
98
// Prints one result row for benchmark `b` from the values accumulated in
// `all_results`.
//
// The two `*_sum_bits` accumulators are read as raw 64-bit patterns and
// decoded to doubles. With N = total iterations, the row reports
//   mean   = S1 / N
//   stddev = sqrt(max(S2 / N - mean^2, 0))
// where S1 and S2 are the decoded weighted sums. When no iterations were
// recorded, mean and stddev are reported as 0.
void print_results(Benchmark *b) {
  using DoubleBits = LIBC_NAMESPACE::fputil::FPBits<double>;
  constexpr auto relaxed = cpp::MemoryOrder::RELAXED;

  BenchmarkResult summary;
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  const uint32_t thread_count = all_results.active_threads.load(relaxed);
  summary.total_iterations = all_results.iterations_sum.load(relaxed);

  // Values reported when nothing was recorded; overwritten below otherwise.
  summary.cycles = 0.0;
  summary.standard_deviation = 0.0;

  if (summary.total_iterations > 0) {
    const uint64_t weighted_sum_bits =
        all_results.weighted_cycles_sum_bits.load(relaxed);
    const uint64_t weighted_square_sum_bits =
        all_results.weighted_squared_cycles_sum_bits.load(relaxed);

    const double weighted_sum = DoubleBits(weighted_sum_bits).get_val();
    const double weighted_square_sum =
        DoubleBits(weighted_square_sum_bits).get_val();
    const double iteration_count =
        static_cast<double>(summary.total_iterations);

    const double mean = weighted_sum / iteration_count;
    const double mean_of_squares = weighted_square_sum / iteration_count;

    // Rounding can push the difference slightly below zero; clamp before the
    // square root.
    double variance = mean_of_squares - (mean * mean);
    if (variance < 0.0)
      variance = 0.0;

    summary.cycles = mean;
    summary.standard_deviation = fputil::sqrt<double>(variance);
  }

  summary.min = all_results.min.load(relaxed);
  summary.max = all_results.max.load(relaxed);
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  // NOTE(review): the format string is kept exactly as it appears in the
  // reviewed text; confirm the leading/trailing spaces are intended.
  LIBC_NAMESPACE::printf(
      " %-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n ",
      b->get_test_name().data(), summary.cycles, summary.standard_deviation,
      static_cast<unsigned long long>(summary.min),
      static_cast<unsigned long long>(summary.max),
      static_cast<unsigned long long>(summary.total_iterations),
      static_cast<unsigned>(thread_count));
}
123
143
124
144
void print_header () {
125
145
LIBC_NAMESPACE::printf (" %s" , GREEN);
126
146
LIBC_NAMESPACE::printf (" Running Suite: %-10s\n " ,
127
147
benchmarks[0 ]->get_suite_name ().data ());
128
148
LIBC_NAMESPACE::printf (" %s" , RESET);
129
- cpp::string titles =
130
- " Benchmark | Cycles | Min | Max | "
131
- " Iterations | Time / Iteration | Stddev | Threads |\n " ;
149
+ cpp::string titles = " Benchmark | Cycles (Mean) | Stddev | "
150
+ " Min | Max | Iterations | Threads |\n " ;
132
151
LIBC_NAMESPACE::printf (titles.data ());
133
152
134
153
cpp::string separator (titles.size (), ' -' );
@@ -139,10 +158,8 @@ void print_header() {
139
158
void Benchmark::run_benchmarks () {
140
159
uint64_t id = gpu::get_thread_id ();
141
160
142
- if (id == 0 ) {
161
+ if (id == 0 )
143
162
print_header ();
144
- LIBC_NAMESPACE::srand (gpu::processor_clock ());
145
- }
146
163
147
164
gpu::sync_threads ();
148
165
@@ -164,69 +181,63 @@ void Benchmark::run_benchmarks() {
164
181
}
165
182
166
183
// Runs `target` repeatedly in samples of growing size and returns the
// aggregated measurements.
//
// Each sample calls `target(call_index)` (with `call_index` increasing across
// the whole run) until the per-sample estimator has seen the current batch
// size, tracking the smallest and largest value returned. After each sample
// the loop stops when any of these holds:
//   * the sample count reached options.max_samples or the batch size reached
//     options.max_iterations;
//   * the minimum duration, sample and iteration requirements are all met and
//     the improvement ratio reported by the progression is below
//     options.epsilon;
//   * the remaining time budget (options.max_duration, decremented by each
//     sample's duration in nanoseconds) has dropped below zero.
// Otherwise the batch size is multiplied by options.scaling_factor.
//
// The returned result carries the iteration count, mean and standard
// deviation from the progression's estimator plus the observed min and max.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          const BenchmarkTarget &target) {
  BenchmarkResult result;
  RuntimeEstimationProgression progression;

  // A sample always runs at least one iteration.
  uint32_t batch_size = options.initial_iterations;
  if (batch_size < 1u)
    batch_size = 1;

  uint32_t sample_count = 0;
  uint64_t elapsed_ns = 0;
  uint64_t fastest = UINT64_MAX;
  uint64_t slowest = 0;
  uint32_t call_index = 0;

  int64_t remaining_ns = options.max_duration;
  while (remaining_ns >= 0) {
    RefinableRuntimeEstimator sample_estimator;

    const clock_t begin_ticks = clock();
    while (sample_estimator.get_iterations() < batch_size) {
      auto measured = target(call_index++);
      slowest = cpp::max(slowest, measured);
      fastest = cpp::min(fastest, measured);
      sample_estimator.update(measured);
    }
    const clock_t end_ticks = clock();

    // Convert clock ticks to nanoseconds.
    const clock_t sample_ns =
        ((end_ticks - begin_ticks) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
    elapsed_ns += sample_ns;
    remaining_ns -= sample_ns;
    ++sample_count;

    const double change_ratio =
        progression.compute_improvement(sample_estimator);

    // Hard caps are checked before the convergence criteria.
    const bool hit_hard_limit = sample_count >= options.max_samples ||
                                batch_size >= options.max_iterations;
    if (hit_hard_limit)
      break;

    const auto accumulated_iterations =
        progression.get_estimator().get_iterations();

    const bool converged = elapsed_ns >= options.min_duration &&
                           sample_count >= options.min_samples &&
                           accumulated_iterations >= options.min_iterations &&
                           change_ratio < options.epsilon;
    if (converged)
      break;

    batch_size = static_cast<uint32_t>(batch_size * options.scaling_factor);
  }

  const auto &final_estimator = progression.get_estimator();
  result.total_iterations = final_estimator.get_iterations();
  result.cycles = final_estimator.get_mean();
  result.standard_deviation = final_estimator.get_stddev();
  result.min = fastest;
  result.max = slowest;

  return result;
}
230
241
231
242
} // namespace benchmarks
232
243
} // namespace LIBC_NAMESPACE_DECL
0 commit comments