[AArch64] Sink fneg instruction to unlock fmls combine #172000

SavchenkoValeriy · 2025-12-12T12:36:41Z

Sink fneg operands of vector fma intrinsics to enable the fmls combine.

llvmbot · 2025-12-12T12:37:18Z

@llvm/pr-subscribers-backend-aarch64

Author: Valeriy Savchenko (SavchenkoValeriy)

Changes

Sink fneg operands of vector fma intrinsics to enable the fmls (fused multiply-subtract) combine in the backend.

Full diff: https://github.com/llvm/llvm-project/pull/172000.diff

2 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+5)
(added) llvm/test/CodeGen/AArch64/sink-fneg.ll (+108)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 043be554f8441..1b3cfdc2a580e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6585,6 +6585,11 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
           cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
           !ST->hasFullFP16())
         return false;
+      for (unsigned I = 0; I < 2; ++I) {
+        // Sinking fnegs will unlock fmls combine pattern
+        if (match(II->getOperand(I), m_FNeg(m_Value())))
+          Ops.push_back(&II->getOperandUse(I));
+      }
       [[fallthrough]];
     case Intrinsic::aarch64_neon_sqdmull:
     case Intrinsic::aarch64_neon_sqdmulh:
diff --git a/llvm/test/CodeGen/AArch64/sink-fneg.ll b/llvm/test/CodeGen/AArch64/sink-fneg.ll
new file mode 100644
index 0000000000000..14a317c4ce3c9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sink-fneg.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+
+define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB0_2: // %exit
+; CHECK-NEXT:    ret
+                                    <4 x float> %a, <4 x float> %b,
+                                    i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %neg = fneg <4 x float> %x
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fnegs_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB1_2: // %exit
+; CHECK-NEXT:    ret
+                                     <4 x float> %a, <4 x float> %b,
+                                     i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %negx = fneg <4 x float> %x
+  %nega = fneg <4 x float> %a
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_with_other_users:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fneg v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB2_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: // %other_use
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str q1, [sp] // 16-byte Spill
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+                                                 <4 x float> %a, <4 x float> %b,
+                                                 i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %neg = fneg <4 x float> %x
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %other_use
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+other_use:
+  call void @foo(<4 x float> %neg)
+  br label %exit
+
+exit:
+  ret <4 x float> %neg
+}
+
+declare void @foo(<4 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

davemgreen

Sounds good. Should we do this for fmul fast too?

llvm/test/CodeGen/AArch64/sink-fneg.ll

SavchenkoValeriy · 2025-12-15T14:37:59Z

> Sounds good. Should we do this for fmul fast too?

@davemgreen I put it together for fmul as well, but the implementation is a bit awkward: I needed a "fallthrough" from the scalar case into the vector case, so I couldn't do an early return and block out other sink candidates. In the end, I had to replace a couple of `return false` statements with `return !Ops.empty()`. Please check whether this makes sense, or whether it's too brittle and we need something different for fmul.

SavchenkoValeriy requested review from RKSimon, aemerson, davemgreen and jroelofs December 12, 2025 12:36

llvmbot added the backend:AArch64 label Dec 12, 2025

SavchenkoValeriy force-pushed the feat/sink-fneg branch from c753ffa to 80c4696 Compare December 12, 2025 12:56

jroelofs approved these changes Dec 12, 2025

View reviewed changes

davemgreen reviewed Dec 13, 2025

View reviewed changes

llvm/test/CodeGen/AArch64/sink-fneg.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AArch64/sink-fneg.ll Outdated Show resolved Hide resolved

SavchenkoValeriy added 2 commits December 15, 2025 14:33

[AArch64][NFC] Add test for suboptimal fmls combine

4029b8c

[AArch64] Sink fneg instruction to unlock fmls combine

181331d

SavchenkoValeriy force-pushed the feat/sink-fneg branch from 80c4696 to 181331d Compare December 15, 2025 14:34

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AArch64] Sink fneg instruction to unlock fmls combine #172000

[AArch64] Sink fneg instruction to unlock fmls combine #172000

SavchenkoValeriy commented Dec 12, 2025 •

edited

Loading

Uh oh!

llvmbot commented Dec 12, 2025

Uh oh!

davemgreen left a comment

Uh oh!

Uh oh!

Uh oh!

SavchenkoValeriy commented Dec 15, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

[AArch64] Sink fneg instruction to unlock fmls combine #172000

Are you sure you want to change the base?

[AArch64] Sink fneg instruction to unlock fmls combine #172000

Conversation

SavchenkoValeriy commented Dec 12, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Dec 12, 2025

Uh oh!

davemgreen left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

SavchenkoValeriy commented Dec 15, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

SavchenkoValeriy commented Dec 12, 2025 •

edited

Loading