-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[AArch64] Sink fneg instruction to unlock fmls combine #172000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-aarch64 Author: Valeriy Savchenko (SavchenkoValeriy) ChangesSink fneg operands of vector fmul intrinsics to enable the fmls (fused multiply-subtract) combine in the backend. Full diff: https://github.com/llvm/llvm-project/pull/172000.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 043be554f8441..1b3cfdc2a580e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6585,6 +6585,11 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
!ST->hasFullFP16())
return false;
+ for (unsigned I = 0; I < 2; ++I) {
+ // Sinking fnegs will unlock fmls combine pattern
+ if (match(II->getOperand(I), m_FNeg(m_Value())))
+ Ops.push_back(&II->getOperandUse(I));
+ }
[[fallthrough]];
case Intrinsic::aarch64_neon_sqdmull:
case Intrinsic::aarch64_neon_sqdmulh:
diff --git a/llvm/test/CodeGen/AArch64/sink-fneg.ll b/llvm/test/CodeGen/AArch64/sink-fneg.ll
new file mode 100644
index 0000000000000..14a317c4ce3c9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sink-fneg.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+
+; A single fneg in %entry feeds fma calls in two different basic blocks.
+; Sinking the fneg to each user block lets instruction selection fold both
+; multiplies into fmls, as the CHECK lines for %entry and %use_bb confirm.
+define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: str q2, [x1]
+; CHECK-NEXT: tbz w0, #0, .LBB0_2
+; CHECK-NEXT: // %bb.1: // %use_bb
+; CHECK-NEXT: fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT: str q4, [x2]
+; CHECK-NEXT: .LBB0_2: // %exit
+; CHECK-NEXT: ret
+ <4 x float> %a, <4 x float> %b,
+ i1 %cond, ptr %out1, ptr %out2) {
+entry:
+ ; One fneg shared by the fma below and the fma in %use_bb.
+ %neg = fneg <4 x float> %x
+ %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+ store <4 x float> %r1, ptr %out1
+ br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+ %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+ store <4 x float> %r2, ptr %out2
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Both multiplicands of each fma are fnegs shared across blocks. After the
+; fnegs are sunk, the two negations cancel ((-a) * (-x) == a * x), so the
+; backend selects fmla with no explicit fneg left — see the CHECK lines.
+define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fnegs_across_bbs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v2.4s, v0.4s, v3.4s
+; CHECK-NEXT: str q2, [x1]
+; CHECK-NEXT: tbz w0, #0, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %use_bb
+; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: str q4, [x2]
+; CHECK-NEXT: .LBB1_2: // %exit
+; CHECK-NEXT: ret
+ <4 x float> %a, <4 x float> %b,
+ i1 %cond, ptr %out1, ptr %out2) {
+entry:
+ ; Two fnegs, both used as multiplicands of both fma calls.
+ %negx = fneg <4 x float> %x
+ %nega = fneg <4 x float> %a
+ %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %z)
+ store <4 x float> %r1, ptr %out1
+ br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+ %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %b)
+ store <4 x float> %r2, ptr %out2
+ br label %exit
+
+exit:
+ ret void
+}
+
+; The fneg has users besides the fma calls (the call to @foo and the return
+; value), so a standalone fneg instruction must still be materialized
+; (CHECK: fneg v1.4s). Both fma users are nevertheless selected as fmls.
+define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_with_other_users:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: fneg v1.4s, v0.4s
+; CHECK-NEXT: str q2, [x1]
+; CHECK-NEXT: tbz w0, #0, .LBB2_2
+; CHECK-NEXT: // %bb.1: // %use_bb
+; CHECK-NEXT: fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: str q4, [x2]
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_2: // %other_use
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: str q1, [sp] // 16-byte Spill
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+ <4 x float> %a, <4 x float> %b,
+ i1 %cond, ptr %out1, ptr %out2) {
+entry:
+ ; %neg is used by two fmas, an opaque call, and the return below.
+ %neg = fneg <4 x float> %x
+ %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+ store <4 x float> %r1, ptr %out1
+ br i1 %cond, label %use_bb, label %other_use
+
+use_bb:
+ %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+ store <4 x float> %r2, ptr %out2
+ br label %exit
+
+other_use:
+ ; Non-fma use of the fneg; keeps the standalone fneg alive.
+ call void @foo(<4 x float> %neg)
+ br label %exit
+
+exit:
+ ret <4 x float> %neg
+}
+
+declare void @foo(<4 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
|
c753ffa to
80c4696
Compare
davemgreen
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good. Should we do this for fmul fast too?
80c4696 to
181331d
Compare
@davemgreen I put it together for `fmul fast` too.
Sink `fneg` operands of vector `fma` intrinsics to enable the `fmls` combine.