-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AArch64] Lower v1i64 and v2i64 [S|U][MIN|MAX] to SVE when available #166735
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The predicate is likely to be hoisted, so in a loop this would result in a single SVE instruction per min/max operation, which should have lower latency than the two-instruction NEON compare-and-select (cmgt/cmhi + bif) sequence.
|
@llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) Changes: The predicate is likely to be hoisted, so in a loop, this would result in a single SVE instruction, which should have lower latency. Full diff: https://github.com/llvm/llvm-project/pull/166735.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d08f9b94227a2..763d27716a4fa 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11330,9 +11330,11 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
break;
}
+ // Note: This lowering is only used for v1i64 and v2i64, where we prefer using
+ // SVE if available.
if (VT.isScalableVector() ||
useSVEForFixedLengthVectorVT(
- VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
+ VT, /*OverrideNEON=*/Subtarget->isSVEorStreamingSVEAvailable())) {
switch (Opcode) {
default:
llvm_unreachable("Wrong instruction");
diff --git a/llvm/test/CodeGen/AArch64/vector-minmax.ll b/llvm/test/CodeGen/AArch64/vector-minmax.ll
new file mode 100644
index 0000000000000..6696f94d404c5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-minmax.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+neon,+sve | FileCheck %s --check-prefix=CHECK-SVE
+
+define <2 x i64> @smax_v2i64(<2 x i64> %a, <2 x i64> %b){
+; CHECK-LABEL: smax_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smax_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: smin_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smin_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @umax_v2i64(<2 x i64> %a, <2 x i64> %b){
+; CHECK-LABEL: umax_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmhi v2.2d, v0.2d, v1.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: umax_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @umin_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: umin_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: umin_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <1 x i64> @smax_v1i64(<1 x i64> %a, <1 x i64> %b){
+; CHECK-LABEL: smax_v1i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt d2, d0, d1
+; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smax_v1i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl1
+; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-SVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b)
+ ret <1 x i64> %0
+}
+
+; This is legal for Neon, so this should use the Neon smax.
+define <4 x i32> @smax_v4i32(<4 x i32> %a, <4 x i32> %b){
+; CHECK-LABEL: smax_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smax_v4i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %0
+}
|
SamTebbs33
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
The predicate is likely to be hoisted, so in a loop, this would result in a single SVE instruction, which should have lower latency.