diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt index 03e84ad800..6d5f0c7019 100644 --- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt +++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt @@ -10,6 +10,7 @@ endif() llvm_test_run() llvm_test_executable(LoopVectorizationBenchmarks + ConditionalScalarAssignment.cpp main.cpp MathFunctions.cpp RuntimeChecks.cpp diff --git a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp new file mode 100644 index 0000000000..9470465855 --- /dev/null +++ b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp @@ -0,0 +1,222 @@ +#include +#include +#include + +#include "benchmark/benchmark.h" + +#define ITERATIONS 100000 + +template +using CSAFunc = T (*)(T *, T *, T *, T); + +// Find the last element in A above the given threshold, +// with default loop vectorization settings. +template +static T run_single_csa_only_autovec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. + T Result = 101; + for (unsigned i = 0; i < ITERATIONS; i++) + if (A[i] > Threshold) + Result = A[i]; + + return Result; +} + +// Find the last element in A above the given threshold, +// with loop vectorization disabled. +template +static T run_single_csa_only_novec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. + T Result = 101; +#pragma clang loop vectorize(disable) interleave(disable) + for (unsigned i = 0; i < ITERATIONS; i++) + if (A[i] > Threshold) + Result = A[i]; + + return Result; +} + +// Find the last elements in A, B, and C above the given threshold, +// with default loop vectorization settings. +template +static T run_multi_csa_only_autovec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A', 'B', and 'C' in init_data below. + T ResultA = 101; + T ResultB = 101; + T ResultC = 101; + for (unsigned i = 0; i < ITERATIONS; i++) { + if (A[i] > Threshold) + ResultA = A[i]; + if (B[i] > Threshold) + ResultB = B[i]; + if (C[i] > Threshold) + ResultC = C[i]; + } + + return ResultA ^ ResultB ^ ResultC; +} + +// Find the last elements in A, B, and C above the given threshold, +// with loop vectorization disabled. +template +static T run_multi_csa_only_novec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A', 'B', and 'C' in init_data below. + T ResultA = 101; + T ResultB = 101; + T ResultC = 101; +#pragma clang loop vectorize(disable) interleave(disable) + for (unsigned i = 0; i < ITERATIONS; i++) { + if (A[i] > Threshold) + ResultA = A[i]; + if (B[i] > Threshold) + ResultB = B[i]; + if (C[i] > Threshold) + ResultC = C[i]; + } + + return ResultA ^ ResultB ^ ResultC; +} + +// Find the last element in A above the given threshold, +// with default loop vectorization settings. +template +static T run_csa_with_arith_autovec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. + T Result = 101; + for (unsigned i = 0; i < ITERATIONS; i++) { + // Do some work to make the difference noticeable + C[i] = A[i] * 13 + B[i] * 5; + if (A[i] > Threshold) + Result = A[i]; + } + + return Result; +} + +// Find the last element in A above the given threshold, +// with loop vectorization disabled. +template +static T run_csa_with_arith_novec(T *A, T *B, T* C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. + T Result = 101; +#pragma clang loop vectorize(disable) interleave(disable) + for (unsigned i = 0; i < ITERATIONS; i++) { + // Do some work to make the difference noticeable + C[i] = A[i] * 13 + B[i] * 5; + if (A[i] > Threshold) + Result = A[i]; + } + + return Result; +} + +// Initialize arrays A, B, and C with random numbers +template static void init_data(T *A, T* B, T *C) { + std::uniform_int_distribution dist(0, 100); + std::mt19937 rng(12345); + for (unsigned i = 0; i < ITERATIONS; i++) { + A[i] = dist(rng); + B[i] = dist(rng); + C[i] = dist(rng); + } +} + +// Benchmark auto-vectorized version. +template +static void __attribute__((always_inline)) +benchmark_csa_autovec(benchmark::State &state, CSAFunc VecFn, + CSAFunc NoVecFn, T Threshold) { + std::unique_ptr A(new T[ITERATIONS]); + std::unique_ptr B(new T[ITERATIONS]); + std::unique_ptr C(new T[ITERATIONS]); + init_data(&A[0], &B[0], &C[0]); + +#ifdef BENCH_AND_VERIFY + // Verify the vectorized and un-vectorized versions produce the same results. + { + T VecRes = VecFn(&A[0], &B[0], &C[0], Threshold); + T NoVecRes = NoVecFn(&A[0], &B[0], &C[0], Threshold); + // We're only interested in whether the conditional assignment results + // were the same. + if (VecRes != NoVecRes) { + std::cerr << "ERROR: autovec result different to scalar result; " + << VecRes << " != " << NoVecRes << "\n"; + exit(1); + } + } +#endif + + for (auto _ : state) { + VecFn(&A[0], &B[0], &C[0], Threshold); + benchmark::DoNotOptimize(A); + benchmark::DoNotOptimize(B); + benchmark::DoNotOptimize(C); + benchmark::ClobberMemory(); + } +} + +// Benchmark version with vectorization disabled. +template +static void __attribute__((always_inline)) +benchmark_csa_novec(benchmark::State &state, CSAFunc NoVecFn, T Threshold) { + std::unique_ptr A(new T[ITERATIONS]); + std::unique_ptr B(new T[ITERATIONS]); + std::unique_ptr C(new T[ITERATIONS]); + init_data(&A[0], &B[0], &C[0]); + + for (auto _ : state) { + NoVecFn(&A[0], &B[0], &C[0], Threshold); + benchmark::DoNotOptimize(A); + benchmark::DoNotOptimize(B); + benchmark::DoNotOptimize(C); + } +} + +// Add benchmarks with and without auto-vectorization +#define ADD_BENCHMARK(ty, Threshold) \ + void BENCHMARK_single_csa_only_autovec_##ty##_(benchmark::State &state) { \ + benchmark_csa_autovec(state, run_single_csa_only_autovec, \ + run_single_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_single_csa_only_autovec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + \ + void BENCHMARK_single_csa_only_novec_##ty##_(benchmark::State &state) { \ + benchmark_csa_novec(state, run_single_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_single_csa_only_novec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + void BENCHMARK_multi_csa_only_autovec_##ty##_(benchmark::State &state) { \ + benchmark_csa_autovec(state, run_multi_csa_only_autovec, \ + run_multi_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_multi_csa_only_autovec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + \ + void BENCHMARK_multi_csa_only_novec_##ty##_(benchmark::State &state) { \ + benchmark_csa_novec(state, run_multi_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_multi_csa_only_novec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + void BENCHMARK_csa_with_arith_autovec_##ty##_(benchmark::State &state) { \ + benchmark_csa_autovec(state, run_csa_with_arith_autovec, \ + run_csa_with_arith_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_csa_with_arith_autovec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + \ + void BENCHMARK_csa_with_arith_novec_##ty##_(benchmark::State &state) { \ + benchmark_csa_novec(state, run_csa_with_arith_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_csa_with_arith_novec_##ty##_)->Unit( \ + benchmark::kNanosecond); + +ADD_BENCHMARK(int32_t, 75) +ADD_BENCHMARK(uint8_t, 90) +ADD_BENCHMARK(int64_t, 60)