From 489b92e181662e64ad5fc0e4dad4127721574030 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 13 Nov 2025 13:36:25 +0000 Subject: [PATCH 1/3] [Microbenchmarks] Add benchmark for conditional scalar assignment autovec Benchmarks with vs. without autovec for a loop containing conditional scalar assignment (plus a little extra arithmetic as a 'work payload'). --- .../LoopVectorization/CMakeLists.txt | 1 + .../ConditionalScalarAssignment.cpp | 118 ++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt index 03e84ad800..6d5f0c7019 100644 --- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt +++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt @@ -10,6 +10,7 @@ endif() llvm_test_run() llvm_test_executable(LoopVectorizationBenchmarks + ConditionalScalarAssignment.cpp main.cpp MathFunctions.cpp RuntimeChecks.cpp diff --git a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp new file mode 100644 index 0000000000..c22cebc49d --- /dev/null +++ b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp @@ -0,0 +1,118 @@ +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" + +#define ITERATIONS 10000000 + +// Find the last element in A above the given threshold, +// with default loop vectorization settings. +template static T run_csa_autovec(T *A, T *B, T *C, T Threshold) { + // Pick out-of-range default value. + T Result = 101; + for (unsigned i = 0; i < ITERATIONS; i++) { + // Do some work to make the difference noticeable + C[i] = A[i] * 13 + B[i] * 5; + if (A[i] > Threshold) + Result = A[i]; + } + + return Result; +} + +// Find the last element in A above the given threshold, +// with loop vectorization disabled. +template static T run_csa_novec(T *A, T *B, T* C, T Threshold) { + // Pick out-of-range default value. + T Result = 101; +#pragma clang loop vectorize(disable) interleave(disable) + for (unsigned i = 0; i < ITERATIONS; i++) { + // Do some work to make the difference noticeable + C[i] = A[i] * 13 + B[i] * 5; + if (A[i] > Threshold) + Result = A[i]; + } + + return Result; +} + +// Initialize arrays A and B with random numbers, and zero array C +template static void init_data(T *A, T* B, T *C) { + std::uniform_int_distribution dist(0, 100); + std::mt19937 rng(12345); + for (unsigned i = 0; i < ITERATIONS; i++) { + A[i] = dist(rng); + B[i] = dist(rng); + C[i] = 0; + } +} + +// Benchmark auto-vectorized version using Fn. +template +static void __attribute__((always_inline)) +benchmark_csa_autovec(benchmark::State &state, T Threshold) { + std::unique_ptr A(new T[ITERATIONS]); + std::unique_ptr B(new T[ITERATIONS]); + std::unique_ptr C(new T[ITERATIONS]); + init_data(&A[0], &B[0], &C[0]); + +#ifdef BENCH_AND_VERIFY + // Verify the vectorized and un-vectorized versions produce the same results. + { + T VecRes = run_csa_novec(&A[0], &B[0], &C[0], Threshold); + T NoVecRes = run_csa_autovec(&A[0], &B[0], &C[0], Threshold); + // We're only interested in whether the conditional assignment results + // were the same. + if (VecRes != NoVecRes) { + std::cerr << "ERROR: autovec result different to scalar result; " + << VecRes << " != " << NoVecRes << "\n"; + exit(1); + } + } +#endif + + for (auto _ : state) { + run_csa_autovec(&A[0], &B[0], &C[0], Threshold); + benchmark::DoNotOptimize(A); + benchmark::DoNotOptimize(B); + benchmark::DoNotOptimize(C); + benchmark::ClobberMemory(); + } +} + +// Benchmark version using Fn with vectorization disabled. +template +static void __attribute__((always_inline)) +benchmark_csa_novec(benchmark::State &state, T Threshold) { + std::unique_ptr A(new T[ITERATIONS]); + std::unique_ptr B(new T[ITERATIONS]); + std::unique_ptr C(new T[ITERATIONS]); + init_data(&A[0], &B[0], &C[0]); + + for (auto _ : state) { + run_csa_novec(&A[0], &B[0], &C[0], Threshold); + benchmark::DoNotOptimize(A); + benchmark::DoNotOptimize(B); + benchmark::DoNotOptimize(C); + } +} + +// Add add auto-vectorized and disabled vectorization benchmarks for math +// function fn and type ty. +#define ADD_BENCHMARK(ty, Threshold) \ + void BENCHMARK_csa_autovec_##ty##_(benchmark::State &state) { \ + benchmark_csa_autovec(state, Threshold); \ + } \ + BENCHMARK(BENCHMARK_csa_autovec_##ty##_)->Unit(benchmark::kMicrosecond); \ + \ + void BENCHMARK_csa_novec_##ty##_(benchmark::State &state) { \ + benchmark_csa_novec(state, Threshold); \ + } \ + BENCHMARK(BENCHMARK_csa_novec_##ty##_)->Unit(benchmark::kMicrosecond); + +ADD_BENCHMARK(int32_t, 75) +ADD_BENCHMARK(uint8_t, 90) +ADD_BENCHMARK(int64_t, 60) From 03848a1a6a99c6ead30e13e03e18dcd7b85017ea Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 13 Nov 2025 15:08:31 +0000 Subject: [PATCH 2/3] Remove unnecessary headers, improve comments --- .../LoopVectorization/ConditionalScalarAssignment.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp index c22cebc49d..75bf638d89 100644 --- a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp +++ b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp @@ -1,7 +1,5 @@ #include -#include #include -#include #include #include "benchmark/benchmark.h" @@ -11,7 +9,8 @@ // Find the last element in A above the given threshold, // with default loop vectorization settings. template static T run_csa_autovec(T *A, T *B, T *C, T Threshold) { - // Pick out-of-range default value. + // Pick a default value that's out of the uniform distribution created + // for 'A' in init_data below. T Result = 101; for (unsigned i = 0; i < ITERATIONS; i++) { // Do some work to make the difference noticeable @@ -26,7 +25,8 @@ template static T run_csa_autovec(T *A, T *B, T *C, T Threshold) { // Find the last element in A above the given threshold, // with loop vectorization disabled. template static T run_csa_novec(T *A, T *B, T* C, T Threshold) { - // Pick out-of-range default value. + // Pick a default value that's out of the uniform distribution created + // for 'A' in init_data below. T Result = 101; #pragma clang loop vectorize(disable) interleave(disable) for (unsigned i = 0; i < ITERATIONS; i++) { From bc3492f421b2280d229658dd81ae61bbea760a55 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 14 Nov 2025 13:45:51 +0000 Subject: [PATCH 3/3] Add single-csa-only and multi-csa-only variants, tidy up --- .../ConditionalScalarAssignment.cpp | 154 +++++++++++++++--- 1 file changed, 129 insertions(+), 25 deletions(-) diff --git a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp index 75bf638d89..9470465855 100644 --- a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp +++ b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp @@ -4,13 +4,89 @@ #include "benchmark/benchmark.h" -#define ITERATIONS 10000000 +#define ITERATIONS 100000 + +template +using CSAFunc = T (*)(T *, T *, T *, T); + +// Find the last element in A above the given threshold, +// with default loop vectorization settings. +template +static T run_single_csa_only_autovec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. + T Result = 101; + for (unsigned i = 0; i < ITERATIONS; i++) + if (A[i] > Threshold) + Result = A[i]; + + return Result; +} + +// Find the last element in A above the given threshold, +// with loop vectorization disabled. +template +static T run_single_csa_only_novec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. + T Result = 101; +#pragma clang loop vectorize(disable) interleave(disable) + for (unsigned i = 0; i < ITERATIONS; i++) + if (A[i] > Threshold) + Result = A[i]; + + return Result; +} + +// Find the last elements in A, B, and C above the given threshold, +// with default loop vectorization settings. +template +static T run_multi_csa_only_autovec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A', 'B', and 'C' in init_data below. + T ResultA = 101; + T ResultB = 101; + T ResultC = 101; + for (unsigned i = 0; i < ITERATIONS; i++) { + if (A[i] > Threshold) + ResultA = A[i]; + if (B[i] > Threshold) + ResultB = B[i]; + if (C[i] > Threshold) + ResultC = C[i]; + } + + return ResultA ^ ResultB ^ ResultC; +} + +// Find the last elements in A, B, and C above the given threshold, +// with loop vectorization disabled. +template +static T run_multi_csa_only_novec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A', 'B', and 'C' in init_data below. + T ResultA = 101; + T ResultB = 101; + T ResultC = 101; +#pragma clang loop vectorize(disable) interleave(disable) + for (unsigned i = 0; i < ITERATIONS; i++) { + if (A[i] > Threshold) + ResultA = A[i]; + if (B[i] > Threshold) + ResultB = B[i]; + if (C[i] > Threshold) + ResultC = C[i]; + } + + return ResultA ^ ResultB ^ ResultC; +} // Find the last element in A above the given threshold, // with default loop vectorization settings. -template static T run_csa_autovec(T *A, T *B, T *C, T Threshold) { - // Pick a default value that's out of the uniform distribution created - // for 'A' in init_data below. +template +static T run_csa_with_arith_autovec(T *A, T *B, T *C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. T Result = 101; for (unsigned i = 0; i < ITERATIONS; i++) { // Do some work to make the difference noticeable @@ -24,9 +100,10 @@ template static T run_csa_autovec(T *A, T *B, T *C, T Threshold) { // Find the last element in A above the given threshold, // with loop vectorization disabled. -template static T run_csa_novec(T *A, T *B, T* C, T Threshold) { - // Pick a default value that's out of the uniform distribution created - // for 'A' in init_data below. +template +static T run_csa_with_arith_novec(T *A, T *B, T* C, T Threshold) { + // Pick a default value that's out of range of the uniform distribution + // created for 'A' in init_data below. T Result = 101; #pragma clang loop vectorize(disable) interleave(disable) for (unsigned i = 0; i < ITERATIONS; i++) { @@ -39,21 +116,22 @@ template static T run_csa_novec(T *A, T *B, T* C, T Threshold) { return Result; } -// Initialize arrays A and B with random numbers, and zero array C +// Initialize arrays A, B, and C with random numbers template static void init_data(T *A, T* B, T *C) { std::uniform_int_distribution dist(0, 100); std::mt19937 rng(12345); for (unsigned i = 0; i < ITERATIONS; i++) { A[i] = dist(rng); B[i] = dist(rng); - C[i] = 0; + C[i] = dist(rng); } } -// Benchmark auto-vectorized version using Fn. +// Benchmark auto-vectorized version. template static void __attribute__((always_inline)) -benchmark_csa_autovec(benchmark::State &state, T Threshold) { +benchmark_csa_autovec(benchmark::State &state, CSAFunc VecFn, + CSAFunc NoVecFn, T Threshold) { std::unique_ptr A(new T[ITERATIONS]); std::unique_ptr B(new T[ITERATIONS]); std::unique_ptr C(new T[ITERATIONS]); @@ -62,8 +140,8 @@ benchmark_csa_autovec(benchmark::State &state, T Threshold) { #ifdef BENCH_AND_VERIFY // Verify the vectorized and un-vectorized versions produce the same results. { - T VecRes = run_csa_novec(&A[0], &B[0], &C[0], Threshold); - T NoVecRes = run_csa_autovec(&A[0], &B[0], &C[0], Threshold); + T VecRes = VecFn(&A[0], &B[0], &C[0], Threshold); + T NoVecRes = NoVecFn(&A[0], &B[0], &C[0], Threshold); // We're only interested in whether the conditional assignment results // were the same. if (VecRes != NoVecRes) { @@ -75,7 +153,7 @@ benchmark_csa_autovec(benchmark::State &state, T Threshold) { #endif for (auto _ : state) { - run_csa_autovec(&A[0], &B[0], &C[0], Threshold); + VecFn(&A[0], &B[0], &C[0], Threshold); benchmark::DoNotOptimize(A); benchmark::DoNotOptimize(B); benchmark::DoNotOptimize(C); @@ -83,35 +161,61 @@ benchmark_csa_autovec(benchmark::State &state, T Threshold) { } } -// Benchmark version using Fn with vectorization disabled. +// Benchmark version with vectorization disabled. template static void __attribute__((always_inline)) -benchmark_csa_novec(benchmark::State &state, T Threshold) { +benchmark_csa_novec(benchmark::State &state, CSAFunc NoVecFn, T Threshold) { std::unique_ptr A(new T[ITERATIONS]); std::unique_ptr B(new T[ITERATIONS]); std::unique_ptr C(new T[ITERATIONS]); init_data(&A[0], &B[0], &C[0]); for (auto _ : state) { - run_csa_novec(&A[0], &B[0], &C[0], Threshold); + NoVecFn(&A[0], &B[0], &C[0], Threshold); benchmark::DoNotOptimize(A); benchmark::DoNotOptimize(B); benchmark::DoNotOptimize(C); } } -// Add add auto-vectorized and disabled vectorization benchmarks for math -// function fn and type ty. +// Add benchmarks with and without auto-vectorization #define ADD_BENCHMARK(ty, Threshold) \ - void BENCHMARK_csa_autovec_##ty##_(benchmark::State &state) { \ - benchmark_csa_autovec(state, Threshold); \ + void BENCHMARK_single_csa_only_autovec_##ty##_(benchmark::State &state) { \ + benchmark_csa_autovec(state, run_single_csa_only_autovec, \ + run_single_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_single_csa_only_autovec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + \ + void BENCHMARK_single_csa_only_novec_##ty##_(benchmark::State &state) { \ + benchmark_csa_novec(state, run_single_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_single_csa_only_novec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + void BENCHMARK_multi_csa_only_autovec_##ty##_(benchmark::State &state) { \ + benchmark_csa_autovec(state, run_multi_csa_only_autovec, \ + run_multi_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_multi_csa_only_autovec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + \ + void BENCHMARK_multi_csa_only_novec_##ty##_(benchmark::State &state) { \ + benchmark_csa_novec(state, run_multi_csa_only_novec, Threshold); \ + } \ + BENCHMARK(BENCHMARK_multi_csa_only_novec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ + void BENCHMARK_csa_with_arith_autovec_##ty##_(benchmark::State &state) { \ + benchmark_csa_autovec(state, run_csa_with_arith_autovec, \ + run_csa_with_arith_novec, Threshold); \ } \ - BENCHMARK(BENCHMARK_csa_autovec_##ty##_)->Unit(benchmark::kMicrosecond); \ + BENCHMARK(BENCHMARK_csa_with_arith_autovec_##ty##_)->Unit( \ + benchmark::kNanosecond); \ \ - void BENCHMARK_csa_novec_##ty##_(benchmark::State &state) { \ - benchmark_csa_novec(state, Threshold); \ + void BENCHMARK_csa_with_arith_novec_##ty##_(benchmark::State &state) { \ + benchmark_csa_novec(state, run_csa_with_arith_novec, Threshold); \ } \ - BENCHMARK(BENCHMARK_csa_novec_##ty##_)->Unit(benchmark::kMicrosecond); + BENCHMARK(BENCHMARK_csa_with_arith_novec_##ty##_)->Unit( \ + benchmark::kNanosecond); ADD_BENCHMARK(int32_t, 75) ADD_BENCHMARK(uint8_t, 90)