Skip to content

Conversation

@SavchenkoValeriy
Copy link
Member

@SavchenkoValeriy SavchenkoValeriy commented Dec 12, 2025

Sink fneg operands of vector fma intrinsics to enable the fmls combine.

@llvmbot
Copy link
Member

llvmbot commented Dec 12, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Valeriy Savchenko (SavchenkoValeriy)

Changes

Sink fneg operands of vector fma intrinsics to enable the fmls (fused multiply-subtract) combine in the backend.


Full diff: https://github.com/llvm/llvm-project/pull/172000.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+5)
  • (added) llvm/test/CodeGen/AArch64/sink-fneg.ll (+108)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 043be554f8441..1b3cfdc2a580e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6585,6 +6585,11 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
           cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
           !ST->hasFullFP16())
         return false;
+      for (unsigned I = 0; I < 2; ++I) {
+        // Sinking fnegs will unlock fmls combine pattern
+        if (match(II->getOperand(I), m_FNeg(m_Value())))
+          Ops.push_back(&II->getOperandUse(I));
+      }
       [[fallthrough]];
     case Intrinsic::aarch64_neon_sqdmull:
     case Intrinsic::aarch64_neon_sqdmulh:
diff --git a/llvm/test/CodeGen/AArch64/sink-fneg.ll b/llvm/test/CodeGen/AArch64/sink-fneg.ll
new file mode 100644
index 0000000000000..14a317c4ce3c9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sink-fneg.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+
+define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB0_2: // %exit
+; CHECK-NEXT:    ret
+                                    <4 x float> %a, <4 x float> %b,
+                                    i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %neg = fneg <4 x float> %x
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fnegs_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB1_2: // %exit
+; CHECK-NEXT:    ret
+                                     <4 x float> %a, <4 x float> %b,
+                                     i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %negx = fneg <4 x float> %x
+  %nega = fneg <4 x float> %a
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_with_other_users:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fneg v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB2_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: // %other_use
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str q1, [sp] // 16-byte Spill
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+                                                 <4 x float> %a, <4 x float> %b,
+                                                 i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %neg = fneg <4 x float> %x
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %other_use
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+other_use:
+  call void @foo(<4 x float> %neg)
+  br label %exit
+
+exit:
+  ret <4 x float> %neg
+}
+
+declare void @foo(<4 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

Copy link
Collaborator

@davemgreen davemgreen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. Should we do this for fmul fast too?

@SavchenkoValeriy
Copy link
Member Author

Sounds good. Should we do this for fmul fast too?

@davemgreen I put it together for fmul as well, but the implementation is a bit awkward: I needed a "fallthrough" from the scalar case into the vector case, so I couldn't return early and block out other sink candidates. In the end, I had to replace a couple of `return false` statements with `return !Ops.empty()`. Please check whether this makes sense, or whether it's too brittle and we need something different for fmul.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants