-
Notifications
You must be signed in to change notification settings - Fork 11.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Extend v2i64 fptosi.sat to v2f64 #91714
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) ChangesThis helps it produce a single instruction for the saturate, as opposed to having to scalarize. Full diff: https://github.com/llvm/llvm-project/pull/91714.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7344387ffe552..09de09195048d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4273,6 +4273,15 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
return SDValue();
SDLoc DL(Op);
+ // Expand to f64 if we are saturating to i64, to help produce keep the lanes
+ // the same width and produce a fcvtzu.
+ if (SatWidth == 64 && SrcElementWidth < 64) {
+ MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
+ SrcVT = F64VT;
+ SrcElementVT = MVT::f64;
+ SrcElementWidth = 64;
+ }
// Cases that we can emit directly.
if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
index 29170aab96566..62669a6d99eae 100644
--- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
@@ -345,11 +345,8 @@ define <2 x i64> @test6_sat(<2 x float> %f) {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov v1.2s, #16.00000000
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: fcvtzs x8, s0
-; CHECK-NEXT: fcvtzs x9, s1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x9
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: ret
%mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 16.000000e+00>
%vcvt.i = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %mul.i)
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
index 2ea581359af6f..4e8bfcd9d7516 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
@@ -436,12 +436,8 @@ entry:
define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-LABEL: stest_f32i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: fcvtzs x8, s0
-; CHECK-NEXT: fcvtzs x9, s1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x9
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: ret
entry:
%conv = fptosi <2 x float> %x to <2 x i128>
@@ -1056,12 +1052,8 @@ entry:
define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
; CHECK-LABEL: stest_f32i64_mm:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: fcvtzs x8, s0
-; CHECK-NEXT: fcvtzs x9, s1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x9
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: ret
entry:
%conv = fptosi <2 x float> %x to <2 x i128>
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index c45885a38f159..d620a8851ee44 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -793,12 +793,8 @@ define <2 x i50> @test_signed_v2f32_v2i50(<2 x float> %f) {
define <2 x i64> @test_signed_v2f32_v2i64(<2 x float> %f) {
; CHECK-LABEL: test_signed_v2f32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: fcvtzs x8, s0
-; CHECK-NEXT: fcvtzs x9, s1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x9
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: ret
%x = call <2 x i64> @llvm.fptosi.sat.v2f32.v2i64(<2 x float> %f)
ret <2 x i64> %x
@@ -1060,17 +1056,10 @@ define <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) {
define <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) {
; CHECK-LABEL: test_signed_v4f32_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov s3, v0.s[1]
-; CHECK-NEXT: fcvtzs x9, s0
-; CHECK-NEXT: mov s2, v1.s[1]
-; CHECK-NEXT: fcvtzs x8, s1
-; CHECK-NEXT: fcvtzs x11, s3
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: fcvtzs x10, s2
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: mov v0.d[1], x11
-; CHECK-NEXT: mov v1.d[1], x10
+; CHECK-NEXT: fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: ret
%x = call <4 x i64> @llvm.fptosi.sat.v4f32.v4i64(<4 x float> %f)
ret <4 x i64> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index c94db3484994c..16e04070b6543 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -707,12 +707,8 @@ define <2 x i50> @test_unsigned_v2f32_v2i50(<2 x float> %f) {
define <2 x i64> @test_unsigned_v2f32_v2i64(<2 x float> %f) {
; CHECK-LABEL: test_unsigned_v2f32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: fcvtzu x8, s0
-; CHECK-NEXT: fcvtzu x9, s1
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x9
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-NEXT: ret
%x = call <2 x i64> @llvm.fptoui.sat.v2f32.v2i64(<2 x float> %f)
ret <2 x i64> %x
@@ -927,17 +923,10 @@ define <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) {
define <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) {
; CHECK-LABEL: test_unsigned_v4f32_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov s3, v0.s[1]
-; CHECK-NEXT: fcvtzu x9, s0
-; CHECK-NEXT: mov s2, v1.s[1]
-; CHECK-NEXT: fcvtzu x8, s1
-; CHECK-NEXT: fcvtzu x11, s3
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: fcvtzu x10, s2
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: mov v0.d[1], x11
-; CHECK-NEXT: mov v1.d[1], x10
+; CHECK-NEXT: fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-NEXT: ret
%x = call <4 x i64> @llvm.fptoui.sat.v4f32.v4i64(<4 x float> %f)
ret <4 x i64> %x
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks good to me, but please wait for one other reviewer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This helps it produce a single instruction for the saturate, as opposed to having to scalarize.
78f286c
to
e75687a
Compare
This helps it produce a single instruction for the saturate, as opposed to having to scalarize.