/
mod.rs
1096 lines (969 loc) · 38.6 KB
/
mod.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
use std::{
cell::UnsafeCell,
fmt,
mem::{self, MaybeUninit},
num::NonZeroUsize,
sync::Barrier,
thread,
};
use crate::{
black_box,
counter::{AnyCounter, CounterCollection, IntoCounter, KnownCounterKind, MaxCountUInt},
divan::SharedContext,
stats::{RawSample, Sample, SampleCollection, Stats, ThreadSample},
time::{FineDuration, Timestamp, UntaggedTimestamp},
util::{self, SyncWrap, Unit},
};
// Used for intra-doc links.
#[allow(unused)]
use crate::counter::BytesCount;
#[cfg(test)]
mod tests;
mod defer;
mod options;
use defer::{DeferSlot, DeferStore};
pub use options::BenchOptions;
/// Number of samples collected per benchmark when the user does not override
/// `sample_count` in `BenchOptions`.
pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;
/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
///
/// # Examples
///
/// ```
/// use divan::{Bencher, black_box};
///
/// #[divan::bench]
/// fn copy_from_slice(bencher: Bencher) {
///     // Input and output buffers get used in the closure.
///     let src = (0..100).collect::<Vec<i32>>();
///     let mut dst = vec![0; src.len()];
///
///     bencher.bench_local(|| {
///         black_box(&mut dst).copy_from_slice(black_box(&src));
///     });
/// }
/// ```
#[must_use = "a benchmark function must be registered"]
pub struct Bencher<'a, 'b, C = BencherConfig> {
    // Mutable handle to the benchmark loop state that records samples and
    // counters once a `bench*` method is called.
    pub(crate) context: &'a mut BenchContext<'b>,
    // Statically-typed configuration (currently just the input generator);
    // changed by builder methods like `with_inputs`.
    pub(crate) config: C,
}
/// Public-in-private type for statically-typed `Bencher` configuration.
///
/// This enables configuring `Bencher` using the builder pattern with zero
/// runtime cost.
pub struct BencherConfig<GenI = Unit> {
    // Closure producing one benchmark input per iteration. Defaults to `Unit`,
    // a zero-sized placeholder for benchmarks that take no input.
    gen_input: GenI,
}
impl<C> fmt::Debug for Bencher<'_, '_, C> {
    /// Formats the bencher opaquely as `Bencher { .. }`, hiding its internal
    /// state regardless of the config type `C`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut builder = f.debug_struct("Bencher");
        builder.finish_non_exhaustive()
    }
}
impl<'a, 'b> Bencher<'a, 'b> {
    /// Creates a bencher whose default input generator produces the zero-sized
    /// `Unit` placeholder.
    #[inline]
    pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
        let config = BencherConfig { gen_input: Unit };
        Self { context, config }
    }
}
impl<'a, 'b> Bencher<'a, 'b> {
    /// Benchmarks a function.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench<O, B>(self, benched: B)
    where
        B: Fn() -> O + Sync,
    {
        // Reusing `bench_values` for a zero-sized non-drop input type should
        // have no overhead.
        //
        // The `()` input hits the ZST fast path in `sample_recorder`, so the
        // per-input machinery compiles away.
        self.with_inputs(|| ()).bench_values(|_: ()| benched());
    }

    /// Benchmarks a function on the current thread.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench_local(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench_local<O, B>(self, mut benched: B)
    where
        B: FnMut() -> O,
    {
        // Reusing `bench_local_values` for a zero-sized non-drop input type
        // should have no overhead.
        self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
    }

    /// Generate inputs for the [benchmarked function](#input-bench).
    ///
    /// Time spent generating inputs does not affect benchmark timing.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// generator is called on the same thread as the sample loop that uses that
    /// input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
        // Re-wraps `self` with the generator in the type-level config; the
        // `BenchContext` borrow is carried over unchanged.
        Bencher { context: self.context, config: BencherConfig { gen_input } }
    }
}
impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
    /// Assign a [`Counter`](crate::counter::Counter) for all iterations of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter depends on [generated inputs](Self::with_inputs), use
    /// [`Bencher::input_counter`] instead.
    ///
    /// If context is not needed, the counter can instead be set via
    /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
    ///
    /// # Examples
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     let s: String = // ...
    ///         # String::new();
    ///
    ///     bencher
    ///         .counter(BytesCount::of_str(&s))
    ///         .bench(|| {
    ///             divan::black_box(&s).chars().count()
    ///         });
    /// }
    /// ```
    #[doc(alias = "throughput")]
    pub fn counter<C>(self, counter: C) -> Self
    where
        C: IntoCounter,
    {
        // Erase the concrete counter type and register it on the context,
        // replacing any previously-set counter of the same kind.
        self.context.counters.set_counter(AnyCounter::new(counter));
        self
    }
}
/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
where
    GenI: FnMut() -> I,
{
    /// Create a [`Counter`](crate::counter::Counter) for each input of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter is constant, use [`Bencher::counter`] instead.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// counter is called on the same thread as the sample loop that generates
    /// and uses that input.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of bytes processed when
    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
    /// is gotten by calling [`BytesCount::of_str`] on each iteration's input
    /// [`String`].
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> String {
    ///             // ...
    ///             # String::new()
    ///         })
    ///         .input_counter(BytesCount::of_str)
    ///         .bench_refs(|s| {
    ///             s.chars().count()
    ///         });
    /// }
    /// ```
    pub fn input_counter<C, F>(self, make_counter: F) -> Self
    where
        F: Fn(&I) -> C + Sync + 'static,
        C: IntoCounter,
    {
        self.context.counters.set_input_counter(make_counter);
        self
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local_values`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn bench_values<O, B>(self, benched: B)
    where
        B: Fn(I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                //
                // `read()` moves the value out of the slot; the slot is left
                // logically uninitialized, which is why `drop_input` below is
                // a no-op.
                let input = unsafe { input.get().read().assume_init() };
                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     let mut values = Vec::new();
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_values(|s| {
    ///             // Use input by-value:
    ///             values.push(s);
    ///         });
    /// }
    /// ```
    pub fn bench_local_values<O, B>(self, mut benched: B)
    where
        B: FnMut(I) -> O,
    {
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };
                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_refs<O, B>(self, benched: B)
    where
        B: Fn(&mut I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                //
                // Unlike `bench_values`, the value stays in the slot and is
                // only borrowed, so it must be dropped explicitly below.
                let input = unsafe { (*input.get()).assume_init_mut() };
                benched(input)
            },
            // Input ownership was not transferred to `benched`.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_local_refs<O, B>(self, mut benched: B)
    where
        B: FnMut(&mut I) -> O,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };
                benched(input)
            },
            // Input ownership was not transferred to `benched`.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }
}
/// State machine for how the benchmark is being run.
#[derive(Clone, Copy)]
pub(crate) enum BenchMode {
    /// The benchmark is being run as `--test`.
    ///
    /// Don't collect samples and run exactly once.
    Test,

    /// Scale `sample_size` to determine the right size for collecting.
    Tune { sample_size: u32 },

    /// Simply collect samples.
    Collect { sample_size: u32 },
}

impl BenchMode {
    /// Returns `true` if running under `--test`.
    #[inline]
    pub fn is_test(self) -> bool {
        match self {
            Self::Test => true,
            _ => false,
        }
    }

    /// Returns `true` while the sample size is still being tuned.
    #[inline]
    pub fn is_tune(self) -> bool {
        match self {
            Self::Tune { .. } => true,
            _ => false,
        }
    }

    /// Returns `true` once samples are being collected for real.
    #[inline]
    pub fn is_collect(self) -> bool {
        match self {
            Self::Collect { .. } => true,
            _ => false,
        }
    }

    /// The number of iterations per sample; `--test` always runs exactly once.
    #[inline]
    pub fn sample_size(self) -> u32 {
        match self {
            Self::Tune { sample_size } | Self::Collect { sample_size } => sample_size,
            Self::Test => 1,
        }
    }
}
/// `#[divan::bench]` loop context.
///
/// Functions called within the benchmark loop should be `#[inline(always)]` to
/// ensure instruction cache locality.
pub(crate) struct BenchContext<'a> {
    // Timer, overhead estimate, and run action shared across all benchmarks.
    shared_context: &'a SharedContext,

    /// User-configured options.
    pub options: &'a BenchOptions,

    /// Whether the benchmark loop was started.
    pub did_run: bool,

    /// The number of threads to run the benchmark. The default is 1.
    ///
    /// When set to 1, the benchmark loop is guaranteed to stay on the current
    /// thread and not spawn any threads.
    pub thread_count: NonZeroUsize,

    /// Recorded samples.
    samples: SampleCollection,

    /// Per-iteration counters grouped by sample.
    counters: CounterCollection,
}
impl<'a> BenchContext<'a> {
/// Creates a new benchmarking context.
pub fn new(
shared_context: &'a SharedContext,
options: &'a BenchOptions,
thread_count: NonZeroUsize,
) -> Self {
Self {
shared_context,
options,
thread_count,
did_run: false,
samples: SampleCollection::default(),
counters: options.counters.to_collection(),
}
}
    /// Runs the single-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// See `bench_loop_threaded`.
    pub fn bench_loop_local<I, O>(
        &mut self,
        gen_input: impl FnMut() -> I,
        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) {
        // SAFETY: Closures are guaranteed to run on the current thread, so they
        // can safely be mutable and non-`Sync`.
        //
        // `SyncWrap` + `UnsafeCell` launder the `FnMut` closures into `Fn +
        // Sync` shims that `bench_loop_threaded` accepts; forcing
        // `thread_count` to 1 below is what makes this sound, since no other
        // thread can ever call them.
        unsafe {
            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
            let benched = SyncWrap::new(UnsafeCell::new(benched));
            let drop_input = SyncWrap::new(drop_input);

            self.thread_count = NonZeroUsize::MIN;
            self.bench_loop_threaded::<I, O>(
                || (*gen_input.get())(),
                |input| (*benched.get())(input),
                |input| drop_input(input),
            )
        }
    }
    /// Runs the multi-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// If `self.threads` is 1, the incoming closures will not escape the
    /// current thread. This guarantee ensures `bench_loop_local` can soundly
    /// reuse this method with mutable non-`Sync` closures.
    ///
    /// When `benched` is called:
    /// - `I` is guaranteed to be initialized.
    /// - No external `&I` or `&mut I` exists.
    ///
    /// When `drop_input` is called:
    /// - All instances of `O` returned from `benched` have been dropped.
    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
    ///   escaped references to `I`.
    fn bench_loop_threaded<I, O>(
        &mut self,
        gen_input: impl Fn() -> I + Sync,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
    ) {
        self.did_run = true;

        let mut current_mode = self.initial_mode();
        let is_test = current_mode.is_test();

        // Low-level sampler closure; shadowed below by a per-sample wrapper of
        // the same name that adds counter bookkeeping.
        let record_sample = self.sample_recorder(gen_input, benched, drop_input);
        let mut defer_store = DeferStore::default();

        let thread_count = self.thread_count.get();
        let aux_thread_count = thread_count - 1;
        let is_single_thread = aux_thread_count == 0;
        let is_multi_thread = !is_single_thread;

        // Per-thread sample info returned by `record_sample`. These are
        // processed locally to emit user-facing sample info. As a result, this
        // only contains `thread_count` many elements at a time.
        let mut raw_samples = Vec::<RawSample>::new();

        // The time spent benchmarking, in picoseconds.
        //
        // Unless `skip_ext_time` is set, this includes time external to
        // `benched`, such as time spent generating inputs and running drop.
        let mut elapsed_picos: u128 = 0;

        // The minimum time for benchmarking, in picoseconds.
        let min_picos = self.options.min_time().picos;

        // The remaining time left for benchmarking, in picoseconds.
        let max_picos = self.options.max_time().picos;

        // Don't bother running if user specifies 0 max time or 0 samples.
        if max_picos == 0 || !self.options.has_samples() {
            return;
        }

        let timer = self.shared_context.timer;
        let timer_kind = timer.kind();

        // `None` while tuning; set once collection begins.
        let mut rem_samples = if current_mode.is_collect() {
            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
        } else {
            None
        };

        // Only measure precision if we need to tune sample size.
        let timer_precision =
            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };

        if !is_test {
            self.samples.all.reserve(self.options.sample_count.unwrap_or(1) as usize);
        }

        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };

        // `while { cond } {}` evaluates the block as the loop condition, which
        // lets the exit logic read as a decision tree.
        while {
            // Conditions for when sampling is over:
            if elapsed_picos >= max_picos {
                // Depleted the benchmarking time budget. This is a strict
                // condition regardless of sample count and minimum time.
                false
            } else if rem_samples.unwrap_or(1) > 0 {
                // More samples expected.
                true
            } else {
                // Continue if we haven't reached the time floor.
                elapsed_picos < min_picos
            }
        } {
            let sample_size = current_mode.sample_size();
            self.samples.sample_size = sample_size;

            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };

            // Sample loop helper:
            //
            // Shadows the outer `record_sample`: accumulates per-input counter
            // totals, then delegates to the sampler for the timed section.
            let record_sample = |defer_store: &mut DeferStore<I, O>| -> RawSample {
                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
                    [0; KnownCounterKind::COUNT];

                // Updates per-input counter info for this sample.
                let mut count_input = |input: &I| {
                    for counter_kind in KnownCounterKind::ALL {
                        // SAFETY: The `I` type cannot change since `with_inputs`
                        // cannot be called more than once on the same `Bencher`.
                        if let Some(count) =
                            unsafe { self.counters.get_input_count(counter_kind, input) }
                        {
                            let total = &mut counter_totals[counter_kind as usize];
                            *total = (*total).saturating_add(count as u128);
                        }
                    }
                };

                // Sample loop:
                let [start, end] = record_sample(
                    sample_size as usize,
                    barrier.as_ref(),
                    defer_store,
                    &mut count_input,
                );

                RawSample { start, end, timer, counter_totals }
            };

            // Sample loop:
            raw_samples.clear();
            if is_single_thread {
                let sample = record_sample(&mut defer_store);
                if !is_test {
                    raw_samples.push(sample);
                }
            } else {
                // TODO: Reuse auxiliary threads across samples.
                thread::scope(|scope| {
                    let thread_handles: Vec<_> = (0..aux_thread_count)
                        .map(|_| scope.spawn(|| record_sample(&mut DeferStore::default())))
                        .collect();

                    let local_sample = record_sample(&mut defer_store);

                    if !is_test {
                        raw_samples.extend(
                            thread_handles
                                .into_iter()
                                .map(|handle| {
                                    // Propagate panics to behave the same as
                                    // automatic joining.
                                    handle
                                        .join()
                                        .unwrap_or_else(|error| std::panic::resume_unwind(error))
                                })
                                .chain(Some(local_sample)),
                        );
                    }
                });
            }

            #[cfg(test)]
            if is_test {
                // '--test' should run the expected number of times but not
                // allocate any samples.
                assert_eq!(raw_samples.capacity(), 0);
            } else {
                assert_eq!(raw_samples.len(), thread_count);
            }

            // If testing, exit the benchmarking loop immediately after timing a
            // single run.
            if is_test {
                break;
            }

            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
            let slowest_time = slowest_sample.duration();

            // TODO: Make tuning be less influenced by early runs. Currently if
            // early runs are very quick but later runs are slow, benchmarking
            // will take a very long time.
            //
            // TODO: Make `sample_size` consider time generating inputs and
            // dropping inputs/outputs. Currently benchmarks like
            // `Bencher::bench_refs(String::clear)` take a very long time.
            if current_mode.is_tune() {
                // Clear previous smaller samples.
                self.samples.clear();
                self.counters.clear_input_counts();

                // If within 100x timer precision, continue tuning.
                //
                // NOTE(review): assumes the measured timer precision is
                // nonzero picoseconds — the division would panic otherwise;
                // confirm `timer.precision()` guarantees this.
                let precision_multiple = slowest_time.picos / timer_precision.picos;
                if precision_multiple <= 100 {
                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
                } else {
                    current_mode = BenchMode::Collect { sample_size };
                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
                }
            }

            // Account the sample duration for the per-sample benchmarking
            // overhead.
            let sub_sample_overhead = {
                let overhead =
                    self.shared_context.bench_overhead.picos.saturating_mul(sample_size as u128);
                move |d: FineDuration| {
                    FineDuration {
                        picos: d.clamp_to(timer_precision).picos.saturating_sub(overhead),
                    }
                    .clamp_to(timer_precision)
                }
            };

            if is_multi_thread {
                // The total wall clock time spent over the current
                // multi-threaded sample set.
                let total_wall_time = {
                    let first_start = raw_samples.iter().map(|s| s.start).min().unwrap();
                    let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                    sub_sample_overhead(last_end.duration_since(first_start, timer))
                };

                self.samples.threads.push(ThreadSample { total_wall_time });
            }

            for raw_sample in &raw_samples {
                self.samples
                    .all
                    .push(Sample { duration: sub_sample_overhead(raw_sample.duration()) });

                // Insert per-input counter information.
                for counter_kind in KnownCounterKind::ALL {
                    if !self.counters.uses_input_counts(counter_kind) {
                        continue;
                    }

                    let total_count = raw_sample.counter_totals[counter_kind as usize];

                    // Cannot overflow `MaxCountUInt` because `total_count`
                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;

                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
                }

                if let Some(rem_samples) = &mut rem_samples {
                    *rem_samples = rem_samples.saturating_sub(1);
                }
            }

            if let Some(initial_start) = initial_start {
                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
            } else {
                // Progress by at least 1ns to prevent extremely fast
                // functions from taking forever when `min_time` is set.
                let progress_picos = slowest_time.picos.max(1_000);
                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
            }
        }
    }
    /// Returns a closure that takes the sample size and input counter, and then
    /// returns a newly recorded sample.
    ///
    /// The returned closure yields `[start, end]` timestamps bracketing the
    /// timed section only; input generation and drops happen outside it.
    fn sample_recorder<I, O>(
        &self,
        gen_input: impl Fn() -> I,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) -> impl Fn(usize, Option<&Barrier>, &mut DeferStore<I, O>, &mut dyn FnMut(&I)) -> [Timestamp; 2]
    {
        // We defer:
        // - Usage of `gen_input` values.
        // - Drop destructor for `O`, preventing it from affecting sample
        //   measurements. Outputs are stored into a pre-allocated buffer during
        //   the sample loop. The allocation is reused between samples to reduce
        //   time spent between samples.
        let timer_kind = self.shared_context.timer.kind();

        move |sample_size: usize,
              barrier: Option<&Barrier>,
              defer_store: &mut DeferStore<I, O>,
              count_input: &mut dyn FnMut(&I)| {
            // Ensures:
            // - All threads start the timed section simultaneously.
            // - Work external to the timed section does not affect the timing
            //   of other threads.
            let sync_threads = || {
                if let Some(barrier) = barrier {
                    barrier.wait();
                }
            };

            // The following logic chooses how to efficiently sample the
            // benchmark function once and assigns `sample_start`/`sample_end`
            // before/after the sample loop.
            //
            // NOTE: Testing and benchmarking should behave exactly the same
            // when getting the sample time span. We don't want to introduce
            // extra work that may worsen measurement quality for real
            // benchmarking.
            let sample_start: UntaggedTimestamp;
            let sample_end: UntaggedTimestamp;

            // Fast path: zero-sized input and droppable-free (or zero-sized)
            // output need no deferred storage at all.
            if mem::size_of::<I>() == 0 && (mem::size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
                // Use a range instead of `defer_store` to make the benchmarking
                // loop cheaper.

                // Run `gen_input` the expected number of times in case it
                // updates external state used by `benched`.
                for _ in 0..sample_size {
                    let input = gen_input();
                    count_input(&input);

                    // Inputs are consumed/dropped later.
                    mem::forget(input);
                }

                sync_threads();
                sample_start = UntaggedTimestamp::start(timer_kind);

                // Sample loop:
                for _ in 0..sample_size {
                    // SAFETY: Input is a ZST, so we can construct one out of
                    // thin air.
                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };

                    mem::forget(black_box(benched(&input)));
                }

                sample_end = UntaggedTimestamp::end(timer_kind);
                sync_threads();

                // Drop outputs and inputs.
                for _ in 0..sample_size {
                    // Output only needs drop if ZST.
                    if mem::size_of::<O>() == 0 {
                        // SAFETY: Output is a ZST, so we can construct one out
                        // of thin air.
                        unsafe { _ = mem::zeroed::<O>() }
                    }

                    if mem::needs_drop::<I>() {
                        // SAFETY: Input is a ZST, so we can construct one out
                        // of thin air and not worry about aliasing.
                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
                    }
                }
            } else {
                defer_store.prepare(sample_size);

                match defer_store.slots() {
                    // Output needs to be dropped. We defer drop in the sample
                    // loop by inserting it into `defer_store`.
                    Ok(defer_slots_slice) => {
                        // Initialize and store inputs.
                        for DeferSlot { input, .. } in defer_slots_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_slots_iter = defer_slots_slice.iter();

                        sync_threads();
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for defer_slot in defer_slots_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized and we have exclusive access to the
                            // output slot.
                            unsafe {
                                let output = benched(&defer_slot.input);
                                *defer_slot.output.get() = MaybeUninit::new(output);
                            }

                            // PERF: `black_box` the slot address because:
                            // - It prevents `input` mutation from being
                            //   optimized out.
                            // - `black_box` writes its input to the stack.
                            //   Using the slot address instead of the output
                            //   by-value reduces overhead when `O` is a larger
                            //   type like `String` since then it will write a
                            //   single word instead of three words.
                            _ = black_box(defer_slot);
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads();

                        // Drop outputs and inputs.
                        for DeferSlot { input, output } in defer_slots_slice {
                            // SAFETY: All outputs were initialized in the
                            // sample loop and we have exclusive access.
                            unsafe { (*output.get()).assume_init_drop() }

                            if mem::needs_drop::<I>() {
                                // SAFETY: The output was dropped and thus we
                                // have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }

                    // Output does not need to be dropped.
                    Err(defer_inputs_slice) => {
                        // Initialize and store inputs.
                        for input in defer_inputs_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_inputs_iter = defer_inputs_slice.iter();

                        sync_threads();
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for input in defer_inputs_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized.
                            _ = black_box(unsafe { benched(input) });
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads();

                        // Drop inputs.
                        if mem::needs_drop::<I>() {
                            for input in defer_inputs_slice {
                                // SAFETY: We have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }
                }
            }

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            unsafe {
                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
            }
        }
    }
#[inline]
fn initial_mode(&self) -> BenchMode {
if self.shared_context.action.is_test() {
BenchMode::Test