
Conversation

@guy-david (Contributor) commented Nov 10, 2025

Requires #164503.

FADDV is matched into FADDPv4f32 + FADDPv2i32p, but this can be relaxed when one or more elements (usually the 4th) are known to be zero.

Before:

movi d1, #0000000000000000
mov v0.s[3], v1.s[0]
faddp v0.4s, v0.4s, v0.4s
faddp s0, v0.2s

After:

mov s1, v0.s[2]
faddp s0, v0.2s
fadd s0, s0, s1

When all of the elements are known to be zero, the intrinsic now folds to a constant instead of emitting the two pairwise additions.
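
For illustration, a minimal IR sketch of the pattern this targets (the function name is made up; it mirrors the added tests):

declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float>)

; Sum only the first three lanes: lane 3 is forced to +0.0, and the combine
; now drops it from the reduction instead of materializing the zero.
define float @sum_first_three(<4 x float> %v) {
  %masked = insertelement <4 x float> %v, float 0.0, i64 3
  %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %masked)
  ret float %sum
}

This should compile to the "After" sequence above.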

@llvmbot (Member) commented Nov 10, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Guy David (guy-david)


Full diff: https://github.com/llvm/llvm-project/pull/167313.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+58)
  • (added) llvm/test/CodeGen/AArch64/faddv.ll (+84)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 132afc27135e9..b4bf97e27bca4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22790,6 +22790,62 @@ static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
   }
 }
 
+/// Optimize patterns where we insert zeros into vector lanes before faddv.
+static SDValue tryCombineFADDVWithZero(SDNode *N, SelectionDAG &DAG) {
+  assert(getIntrinsicID(N) == Intrinsic::aarch64_neon_faddv &&
+         "Expected NEON faddv intrinsic");
+  SDLoc DL(N);
+  SDValue Vec = N->getOperand(1);
+  EVT VT = Vec.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  APInt DemandedElts = APInt::getAllOnes(NumElts);
+  APInt KnownZeroElts = DAG.computeVectorKnownZeroElements(Vec, DemandedElts);
+  unsigned NumZeroElts = KnownZeroElts.popcount();
+  // No element is known to be +0.0; fall back to the TableGen pattern.
+  if (NumZeroElts == 0)
+    return SDValue();
+  // All elements are +0.0, just return zero.
+  if (NumZeroElts == NumElts)
+    return DAG.getConstantFP(0.0, DL, EltVT);
+
+  // At least one element is +0.0, so it is worth decomposing the reduction
+  // into scalar fadds. FADDV reduces pairwise, so the pairing order of the
+  // elements must be preserved.
+
+  // Check whether it is safe to produce a zero of the wrong sign. Otherwise,
+  // if every remaining element is -0.0, the full reduction would yield +0.0
+  // (since -0.0 + +0.0 == +0.0), but dropping the +0.0 element yields -0.0.
+  SDNodeFlags Flags = N->getFlags();
+  bool IsSignedZeroSafe = Flags.hasNoSignedZeros() ||
+                          DAG.allUsesSignedZeroInsensitive(SDValue(N, 0));
+  if (!IsSignedZeroSafe)
+    return SDValue();
+
+  // Extract the elements, marking known +0.0 lanes with a null SDValue;
+  // KnownZeroElts indexes original lanes, so later rounds must not use it.
+  SmallVector<SDValue, 4> Elts;
+  for (unsigned I = 0; I < NumElts; I++)
+    Elts.push_back(KnownZeroElts[I]
+                       ? SDValue()
+                       : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
+                                     DAG.getConstant(I, DL, MVT::i64)));
+  // Perform the pairwise reduction. A null entry contributes nothing to its
+  // pair; a pair of nulls stays null.
+  while (Elts.size() > 1) {
+    SmallVector<SDValue, 2> NewElts;
+    for (unsigned I = 0; I < Elts.size(); I += 2) {
+      if (Elts[I] && Elts[I + 1])
+        NewElts.push_back(
+            DAG.getNode(ISD::FADD, DL, EltVT, Elts[I], Elts[I + 1]));
+      else
+        NewElts.push_back(Elts[I] ? Elts[I] : Elts[I + 1]);
+    }
+    Elts = std::move(NewElts);
+  }
+  return Elts[0];
+}
+
 static SDValue performIntrinsicCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const AArch64Subtarget *Subtarget) {
@@ -22813,6 +22869,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
   case Intrinsic::aarch64_neon_umaxv:
     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
+  case Intrinsic::aarch64_neon_faddv:
+    return tryCombineFADDVWithZero(N, DAG);
   case Intrinsic::aarch64_neon_fmax:
     return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
diff --git a/llvm/test/CodeGen/AArch64/faddv.ll b/llvm/test/CodeGen/AArch64/faddv.ll
new file mode 100644
index 0000000000000..e4a3781150cf7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/faddv.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+; Test element at index 0 is zero.
+define float @test_v2f32_element_0_zero(<2 x float> %vec) {
+; CHECK-LABEL: test_v2f32_element_0_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
+  %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %with_zero)
+  ret float %sum
+}
+
+; Test element at index 3 is zero.
+define float @test_v4f32_element_3_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_element_3_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov s1, v0.s[2]
+; CHECK-NEXT:    faddp s0, v0.2s
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fabs s0, s0
+; CHECK-NEXT:    ret
+entry:
+  %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
+  %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_zero)
+  %abs = call float @llvm.fabs.f32(float %sum)
+  ret float %abs
+}
+
+; Test elements at index 0 and 2 are zero.
+define float @test_v4f32_elements_0_2_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_elements_0_2_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov s1, v0.s[3]
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fabs s0, s0
+; CHECK-NEXT:    ret
+entry:
+  %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
+  %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
+  %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero2)
+  %abs = call float @llvm.fabs.f32(float %sum)
+  ret float %abs
+}
+
+; Test all elements are zero.
+define float @test_v4f32_all_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_all_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ret
+entry:
+  %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
+  %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
+  %zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
+  %zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
+  %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero4)
+  ret float %sum
+}
+
+; Test element at index 0 is zero.
+define double @test_v2f64_element_0_zero(<2 x double> %vec) {
+; CHECK-LABEL: test_v2f64_element_0_zero:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov d0, v0.d[1]
+; CHECK-NEXT:    fabs d0, d0
+; CHECK-NEXT:    ret
+entry:
+  %with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
+  %sum = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %with_zero)
+  %abs = call double @llvm.fabs.f64(double %sum)
+  ret double %abs
+}
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+
+declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double>)

@guy-david force-pushed the users/guy-david/dag-combine-sign-insensitive branch from 44ed78d to 5148b0d on November 12, 2025 09:06
@guy-david force-pushed the users/guy-david/aarch64-decompose-faddv branch from 7a1406e to 5a1dc0b on November 12, 2025 09:07
@guy-david changed the base branch from users/guy-david/dag-combine-sign-insensitive to users/guy-david/dag-combine-fp-to-int-to-fp-sign-insensitive on November 12, 2025 09:07
@davemgreen (Collaborator) commented:

We shouldn't really have two lowerings for aarch64.neon.faddv and llvm.vector.reduce.fadd, but should this apply to both?
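
(For reference, the generic form referred to here is the target-independent reduction intrinsic; an illustrative sketch, not part of the patch:

%sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %masked)

With reassoc this also lowers to a pairwise FADDP sequence on AArch64, so the same known-zero reasoning would plausibly apply to both paths.)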
