From 489b92e181662e64ad5fc0e4dad4127721574030 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Thu, 13 Nov 2025 13:36:25 +0000
Subject: [PATCH 1/3] [Microbenchmarks] Add benchmark for conditional scalar
 assignment autovec

Benchmarks with vs. without autovec for a loop containing conditional
scalar assignment (plus a little extra arithmetic as a 'work payload').
---
 .../LoopVectorization/CMakeLists.txt          |   1 +
 .../ConditionalScalarAssignment.cpp           | 118 ++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
index 03e84ad800..6d5f0c7019 100644
--- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt
+++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
@@ -10,6 +10,7 @@ endif()
 llvm_test_run()
 
 llvm_test_executable(LoopVectorizationBenchmarks
+  ConditionalScalarAssignment.cpp
   main.cpp
   MathFunctions.cpp
   RuntimeChecks.cpp
diff --git a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
new file mode 100644
index 0000000000..c22cebc49d
--- /dev/null
+++ b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
@@ -0,0 +1,118 @@
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <optional>
+#include <random>
+
+#include "benchmark/benchmark.h"
+
+#define ITERATIONS 10000000
+
+// Find the last element in A above the given threshold,
+// with default loop vectorization settings.
+template <typename T> static T run_csa_autovec(T *A, T *B, T *C, T Threshold) {
+  // Pick out-of-range default value.
+  T Result = 101;
+  for (unsigned i = 0; i < ITERATIONS; i++) {
+    // Do some work to make the difference noticeable
+    C[i] = A[i] * 13 + B[i] * 5;
+    if (A[i] > Threshold)
+      Result = A[i];
+  }
+
+  return Result;
+}
+
+// Find the last element in A above the given threshold,
+// with loop vectorization disabled.
+template <typename T> static T run_csa_novec(T *A, T *B, T* C, T Threshold) {
+  // Pick out-of-range default value.
+  T Result = 101;
+#pragma clang loop vectorize(disable) interleave(disable)
+  for (unsigned i = 0; i < ITERATIONS; i++) {
+    // Do some work to make the difference noticeable
+    C[i] = A[i] * 13 + B[i] * 5;
+    if (A[i] > Threshold)
+      Result = A[i];
+  }
+
+  return Result;
+}
+
+// Initialize arrays A and B with random numbers, and zero array C
+template <typename T> static void init_data(T *A, T* B, T *C) {
+  std::uniform_int_distribution<T> dist(0, 100);
+  std::mt19937 rng(12345);
+  for (unsigned i = 0; i < ITERATIONS; i++) {
+    A[i] = dist(rng);
+    B[i] = dist(rng);
+    C[i] = 0;
+  }
+}
+
+// Benchmark auto-vectorized version using Fn.
+template <typename T>
+static void __attribute__((always_inline))
+benchmark_csa_autovec(benchmark::State &state, T Threshold) {
+  std::unique_ptr<T[]> A(new T[ITERATIONS]);
+  std::unique_ptr<T[]> B(new T[ITERATIONS]);
+  std::unique_ptr<T[]> C(new T[ITERATIONS]);
+  init_data(&A[0], &B[0], &C[0]);
+
+#ifdef BENCH_AND_VERIFY
+  // Verify the vectorized and un-vectorized versions produce the same results.
+  {
+    T VecRes = run_csa_novec(&A[0], &B[0], &C[0], Threshold);
+    T NoVecRes = run_csa_autovec(&A[0], &B[0], &C[0], Threshold);
+    // We're only interested in whether the conditional assignment results
+    // were the same.
+    if (VecRes != NoVecRes) {
+      std::cerr << "ERROR: autovec result different to scalar result; "
+                << VecRes << " != " << NoVecRes << "\n";
+      exit(1);
+    }
+  }
+#endif
+
+  for (auto _ : state) {
+    run_csa_autovec(&A[0], &B[0], &C[0], Threshold);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Benchmark version using Fn with vectorization disabled.
+template <typename T>
+static void __attribute__((always_inline))
+benchmark_csa_novec(benchmark::State &state, T Threshold) {
+  std::unique_ptr<T[]> A(new T[ITERATIONS]);
+  std::unique_ptr<T[]> B(new T[ITERATIONS]);
+  std::unique_ptr<T[]> C(new T[ITERATIONS]);
+  init_data(&A[0], &B[0], &C[0]);
+
+  for (auto _ : state) {
+    run_csa_novec(&A[0], &B[0], &C[0], Threshold);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+  }
+}
+
+// Add add auto-vectorized and disabled vectorization benchmarks for math
+// function fn and type ty.
+#define ADD_BENCHMARK(ty, Threshold)                                           \
+  void BENCHMARK_csa_autovec_##ty##_(benchmark::State &state) {                \
+    benchmark_csa_autovec<ty>(state, Threshold);                               \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_csa_autovec_##ty##_)->Unit(benchmark::kMicrosecond);     \
+                                                                               \
+  void BENCHMARK_csa_novec_##ty##_(benchmark::State &state) {                  \
+    benchmark_csa_novec<ty>(state, Threshold);                                 \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_csa_novec_##ty##_)->Unit(benchmark::kMicrosecond);
+
+ADD_BENCHMARK(int32_t, 75)
+ADD_BENCHMARK(uint8_t, 90)
+ADD_BENCHMARK(int64_t, 60)

From 03848a1a6a99c6ead30e13e03e18dcd7b85017ea Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Thu, 13 Nov 2025 15:08:31 +0000
Subject: [PATCH 2/3] Remove unnecessary headers, improve comments

---
 .../LoopVectorization/ConditionalScalarAssignment.cpp     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
index c22cebc49d..75bf638d89 100644
--- a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
+++ b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
@@ -1,7 +1,5 @@
 #include <iostream>
-#include <math.h>
 #include <memory>
-#include <optional>
 #include <random>
 
 #include "benchmark/benchmark.h"
@@ -11,7 +9,8 @@
 // Find the last element in A above the given threshold,
 // with default loop vectorization settings.
 template <typename T> static T run_csa_autovec(T *A, T *B, T *C, T Threshold) {
-  // Pick out-of-range default value.
+  // Pick a default value that's out of the uniform distribution created
+  // for 'A' in init_data below.
   T Result = 101;
   for (unsigned i = 0; i < ITERATIONS; i++) {
     // Do some work to make the difference noticeable
@@ -26,7 +25,8 @@ template <typename T> static T run_csa_autovec(T *A, T *B, T *C, T Threshold) {
 // Find the last element in A above the given threshold,
 // with loop vectorization disabled.
 template <typename T> static T run_csa_novec(T *A, T *B, T* C, T Threshold) {
-  // Pick out-of-range default value.
+  // Pick a default value that's out of the uniform distribution created
+  // for 'A' in init_data below.
   T Result = 101;
 #pragma clang loop vectorize(disable) interleave(disable)
   for (unsigned i = 0; i < ITERATIONS; i++) {

From bc3492f421b2280d229658dd81ae61bbea760a55 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Fri, 14 Nov 2025 13:45:51 +0000
Subject: [PATCH 3/3] Add single-csa-only and multi-csa-only variants, tidy up

---
 .../ConditionalScalarAssignment.cpp           | 154 +++++++++++++++---
 1 file changed, 129 insertions(+), 25 deletions(-)

diff --git a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
index 75bf638d89..9470465855 100644
--- a/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
+++ b/MicroBenchmarks/LoopVectorization/ConditionalScalarAssignment.cpp
@@ -4,13 +4,89 @@
 
 #include "benchmark/benchmark.h"
 
-#define ITERATIONS 10000000
+#define ITERATIONS 100000
+
+template <typename T>
+using CSAFunc = T (*)(T *, T *, T *, T);
+
+// Return the last element of A greater than Threshold, or 101 if none is,
+// using default loop vectorization settings. B and C are not accessed.
+template <typename T>
+static T run_single_csa_only_autovec(T *A, T *B, T *C, T Threshold) {
+  // 101 marks "no match": init_data (below) draws every element from a
+  // uniform distribution over [0, 100], so no array value can equal it.
+  T Result = 101;
+  for (unsigned i = 0; i < ITERATIONS; i++)
+    if (A[i] > Threshold)
+      Result = A[i];
+
+  return Result;
+}
+
+// Return the last element of A greater than Threshold, or 101 if none is,
+// with loop vectorization and interleaving disabled. B and C are not accessed.
+template <typename T>
+static T run_single_csa_only_novec(T *A, T *B, T *C, T Threshold) {
+  // 101 marks "no match": init_data (below) draws every element from a
+  // uniform distribution over [0, 100], so no array value can equal it.
+  T Result = 101;
+#pragma clang loop vectorize(disable) interleave(disable)
+  for (unsigned i = 0; i < ITERATIONS; i++)
+    if (A[i] > Threshold)
+      Result = A[i];
+
+  return Result;
+}
+
+// Track the last element above Threshold in each of A, B, and C, and return
+// the XOR of the three results; default loop vectorization settings.
+template <typename T>
+static T run_multi_csa_only_autovec(T *A, T *B, T *C, T Threshold) {
+  // Each result starts at 101, which init_data (below) can never produce
+  // because it fills 'A', 'B', and 'C' from a uniform range of [0, 100].
+  T ResultA = 101;
+  T ResultB = 101;
+  T ResultC = 101;
+  for (unsigned i = 0; i < ITERATIONS; i++) {
+    if (A[i] > Threshold)
+      ResultA = A[i];
+    if (B[i] > Threshold)
+      ResultB = B[i];
+    if (C[i] > Threshold)
+      ResultC = C[i];
+  }
+
+  return ResultA ^ ResultB ^ ResultC;
+}
+
+// Track the last element above Threshold in each of A, B, and C, and return
+// the XOR of the three results; loop vectorization/interleaving disabled.
+template <typename T>
+static T run_multi_csa_only_novec(T *A, T *B, T *C, T Threshold) {
+  // Each result starts at 101, which init_data (below) can never produce
+  // because it fills 'A', 'B', and 'C' from a uniform range of [0, 100].
+  T ResultA = 101;
+  T ResultB = 101;
+  T ResultC = 101;
+#pragma clang loop vectorize(disable) interleave(disable)
+  for (unsigned i = 0; i < ITERATIONS; i++) {
+    if (A[i] > Threshold)
+      ResultA = A[i];
+    if (B[i] > Threshold)
+      ResultB = B[i];
+    if (C[i] > Threshold)
+      ResultC = C[i];
+  }
+
+  return ResultA ^ ResultB ^ ResultC;
+}
 
 // Find the last element in A above the given threshold,
 // with default loop vectorization settings.
-template <typename T> static T run_csa_autovec(T *A, T *B, T *C, T Threshold) {
-  // Pick a default value that's out of the uniform distribution created
-  // for 'A' in init_data below.
+template <typename T>
+static T run_csa_with_arith_autovec(T *A, T *B, T *C, T Threshold) {
+  // Pick a default value that's out of range of the uniform distribution
+  // created for 'A' in init_data below.
   T Result = 101;
   for (unsigned i = 0; i < ITERATIONS; i++) {
     // Do some work to make the difference noticeable
@@ -24,9 +100,10 @@ template <typename T> static T run_csa_autovec(T *A, T *B, T *C, T Threshold) {
 
 // Find the last element in A above the given threshold,
 // with loop vectorization disabled.
-template <typename T> static T run_csa_novec(T *A, T *B, T* C, T Threshold) {
-  // Pick a default value that's out of the uniform distribution created
-  // for 'A' in init_data below.
+template <typename T>
+static T run_csa_with_arith_novec(T *A, T *B, T* C, T Threshold) {
+  // Pick a default value that's out of range of the uniform distribution
+  // created for 'A' in init_data below.
   T Result = 101;
 #pragma clang loop vectorize(disable) interleave(disable)
   for (unsigned i = 0; i < ITERATIONS; i++) {
@@ -39,21 +116,22 @@ template <typename T> static T run_csa_novec(T *A, T *B, T* C, T Threshold) {
   return Result;
 }
 
-// Initialize arrays A and B with random numbers, and zero array C
+// Fill A, B, and C with fixed-seed pseudo-random values in the range [0, 100].
 template <typename T> static void init_data(T *A, T* B, T *C) {
   std::uniform_int_distribution<T> dist(0, 100);
   std::mt19937 rng(12345);
   for (unsigned i = 0; i < ITERATIONS; i++) {
     A[i] = dist(rng);
     B[i] = dist(rng);
-    C[i] = 0;
+    C[i] = dist(rng);
   }
 }
 
-// Benchmark auto-vectorized version using Fn.
+// Benchmark VecFn; its result is kept live so the kernel is not optimized out.
 template <typename T>
 static void __attribute__((always_inline))
-benchmark_csa_autovec(benchmark::State &state, T Threshold) {
+benchmark_csa_autovec(benchmark::State &state, CSAFunc<T> VecFn,
+                      CSAFunc<T> NoVecFn, T Threshold) {
   std::unique_ptr<T[]> A(new T[ITERATIONS]);
   std::unique_ptr<T[]> B(new T[ITERATIONS]);
   std::unique_ptr<T[]> C(new T[ITERATIONS]);
@@ -62,8 +140,8 @@ benchmark_csa_autovec(benchmark::State &state, T Threshold) {
 #ifdef BENCH_AND_VERIFY
   // Verify the vectorized and un-vectorized versions produce the same results.
   {
-    T VecRes = run_csa_novec(&A[0], &B[0], &C[0], Threshold);
-    T NoVecRes = run_csa_autovec(&A[0], &B[0], &C[0], Threshold);
+    T VecRes = VecFn(&A[0], &B[0], &C[0], Threshold);
+    T NoVecRes = NoVecFn(&A[0], &B[0], &C[0], Threshold);
     // We're only interested in whether the conditional assignment results
     // were the same.
     if (VecRes != NoVecRes) {
@@ -75,7 +153,7 @@ benchmark_csa_autovec(benchmark::State &state, T Threshold) {
 #endif
 
   for (auto _ : state) {
-    run_csa_autovec(&A[0], &B[0], &C[0], Threshold);
+    benchmark::DoNotOptimize(VecFn(&A[0], &B[0], &C[0], Threshold));
     benchmark::DoNotOptimize(A);
     benchmark::DoNotOptimize(B);
     benchmark::DoNotOptimize(C);
@@ -83,35 +161,61 @@ benchmark_csa_autovec(benchmark::State &state, T Threshold) {
   }
 }
 
-// Benchmark version using Fn with vectorization disabled.
+// Benchmark NoVecFn, keeping its result live so the kernel is not elided.
 template <typename T>
 static void __attribute__((always_inline))
-benchmark_csa_novec(benchmark::State &state, T Threshold) {
+benchmark_csa_novec(benchmark::State &state, CSAFunc<T> NoVecFn, T Threshold) {
   std::unique_ptr<T[]> A(new T[ITERATIONS]);
   std::unique_ptr<T[]> B(new T[ITERATIONS]);
   std::unique_ptr<T[]> C(new T[ITERATIONS]);
   init_data(&A[0], &B[0], &C[0]);
 
   for (auto _ : state) {
-    run_csa_novec(&A[0], &B[0], &C[0], Threshold);
+    benchmark::DoNotOptimize(NoVecFn(&A[0], &B[0], &C[0], Threshold));
     benchmark::DoNotOptimize(A);
     benchmark::DoNotOptimize(B);
     benchmark::DoNotOptimize(C);
   }
 }
 
-// Add add auto-vectorized and disabled vectorization benchmarks for math
-// function fn and type ty.
+// Define and register the autovec/novec benchmarks of all three kernels for ty.
 #define ADD_BENCHMARK(ty, Threshold)                                           \
-  void BENCHMARK_csa_autovec_##ty##_(benchmark::State &state) {                \
-    benchmark_csa_autovec<ty>(state, Threshold);                               \
+  void BENCHMARK_single_csa_only_autovec_##ty##_(benchmark::State &state) {    \
+    benchmark_csa_autovec<ty>(state, run_single_csa_only_autovec,              \
+                              run_single_csa_only_novec, Threshold);           \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_single_csa_only_autovec_##ty##_)->Unit(                  \
+                                                      benchmark::kNanosecond); \
+                                                                               \
+  void BENCHMARK_single_csa_only_novec_##ty##_(benchmark::State &state) {      \
+    benchmark_csa_novec<ty>(state, run_single_csa_only_novec, Threshold);      \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_single_csa_only_novec_##ty##_)->Unit(                    \
+                                                      benchmark::kNanosecond); \
+  void BENCHMARK_multi_csa_only_autovec_##ty##_(benchmark::State &state) {     \
+    benchmark_csa_autovec<ty>(state, run_multi_csa_only_autovec,               \
+                              run_multi_csa_only_novec, Threshold);            \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_multi_csa_only_autovec_##ty##_)->Unit(                   \
+                                                      benchmark::kNanosecond); \
+                                                                               \
+  void BENCHMARK_multi_csa_only_novec_##ty##_(benchmark::State &state) {       \
+    benchmark_csa_novec<ty>(state, run_multi_csa_only_novec, Threshold);       \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_multi_csa_only_novec_##ty##_)->Unit(                     \
+                                                      benchmark::kNanosecond); \
+  void BENCHMARK_csa_with_arith_autovec_##ty##_(benchmark::State &state) {     \
+    benchmark_csa_autovec<ty>(state, run_csa_with_arith_autovec,               \
+                              run_csa_with_arith_novec, Threshold);            \
   }                                                                            \
-  BENCHMARK(BENCHMARK_csa_autovec_##ty##_)->Unit(benchmark::kMicrosecond);     \
+  BENCHMARK(BENCHMARK_csa_with_arith_autovec_##ty##_)->Unit(                   \
+                                                      benchmark::kNanosecond); \
                                                                                \
-  void BENCHMARK_csa_novec_##ty##_(benchmark::State &state) {                  \
-    benchmark_csa_novec<ty>(state, Threshold);                                 \
+  void BENCHMARK_csa_with_arith_novec_##ty##_(benchmark::State &state) {       \
+    benchmark_csa_novec<ty>(state, run_csa_with_arith_novec, Threshold);       \
   }                                                                            \
-  BENCHMARK(BENCHMARK_csa_novec_##ty##_)->Unit(benchmark::kMicrosecond);
+  BENCHMARK(BENCHMARK_csa_with_arith_novec_##ty##_)->Unit(                     \
+                                                        benchmark::kNanosecond);
 
 ADD_BENCHMARK(int32_t, 75)
 ADD_BENCHMARK(uint8_t, 90)