-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT #90065
Conversation
@llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang) ChangesPatch is 52.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90065.diff 6 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb43cbe15f5225..827537818f059f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
+ setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
+
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1431,6 +1433,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, VT, Custom);
}
+ setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
+
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
@@ -1731,6 +1735,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
+ if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
+ setOperationAction(ISD::LRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::LRINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::LRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::LRINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v4f64, Legal);
+ }
// This block controls legalization for 512-bit operations with 8/16/32/64 bit
// elements. 512-bits can be disabled based on prefer-vector-width and
@@ -1765,6 +1779,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
+ setOperationAction(ISD::LRINT, MVT::v16f32,
+ Subtarget.hasDQI() ? Legal : Custom);
+ setOperationAction(ISD::LRINT, MVT::v8f64,
+ Subtarget.hasDQI() ? Legal : Custom);
+ if (Subtarget.hasDQI())
+ setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
@@ -2488,6 +2508,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FMAXNUM,
ISD::SUB,
ISD::LOAD,
+ ISD::LRINT,
+ ISD::LLRINT,
ISD::MLOAD,
ISD::STORE,
ISD::MSTORE,
@@ -21159,10 +21181,15 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ EVT DstVT = Op.getSimpleValueType();
MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT.isVector())
+ return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
+
if (SrcVT == MVT::f16)
return SDValue();
@@ -32217,7 +32244,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::LRINT:
- case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
+ case ISD::LLRINT: return LowerLRINT_LLRINT(Op, Subtarget, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
@@ -51556,6 +51583,22 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ SDLoc DL(N);
+
+ if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
+ SrcVT != MVT::v2f32)
+ return SDValue();
+
+ return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+ DAG.getUNDEF(SrcVT)));
+}
+
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
@@ -51902,6 +51945,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
+ // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
+ if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
+ Src.hasOneUse())
+ return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
return SDValue();
}
@@ -56848,6 +56896,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::LRINT:
+ case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case X86ISD::VFCMULC:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e348ba6e8ac085..eea771d235b2da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1673,7 +1673,8 @@ namespace llvm {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLRINT_LLRINT(SDValue Op, const X86Subtarget &STI,
+ SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 43a40f5e691ea3..ec2a5f52a7b6aa 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8811,7 +8811,18 @@ let Predicates = [HasVLX] in {
def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i32 (lrint VR128X:$src)), (VCVTPS2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQZ128rm addr:$src)>;
+ def : Pat<(v8i32 (lrint VR256X:$src)), (VCVTPS2DQZ256rr VR256X:$src)>;
+ def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQZ256rm addr:$src)>;
+ def : Pat<(v4i32 (lrint VR256X:$src)), (VCVTPD2DQZ256rr VR256X:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQZ256rm addr:$src)>;
}
+def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;
+def : Pat<(v16i32 (lrint (loadv16f32 addr:$src))), (VCVTPS2DQZrm addr:$src)>;
+def : Pat<(v8i32 (lrint VR512:$src)), (VCVTPD2DQZrr VR512:$src)>;
+def : Pat<(v8i32 (lrint (loadv8f64 addr:$src))), (VCVTPD2DQZrm addr:$src)>;
let Predicates = [HasDQI, HasVLX] in {
def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
@@ -8857,6 +8868,30 @@ let Predicates = [HasDQI, HasVLX] in {
(X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i64 (lrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (lrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+ def : Pat<(v4i64 (llrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (llrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+ def : Pat<(v2i64 (lrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (lrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+ def : Pat<(v4i64 (lrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (lrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+ def : Pat<(v2i64 (llrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (llrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+ def : Pat<(v4i64 (llrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (llrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i64 (lrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (lrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+ def : Pat<(v8i64 (llrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (llrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+ def : Pat<(v8i64 (lrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+ def : Pat<(v8i64 (lrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
+ def : Pat<(v8i64 (llrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+ def : Pat<(v8i64 (llrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
}
let Predicates = [HasVLX] in {
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 063b572761e7d1..62b9b93953ad5a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1554,7 +1554,6 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
(v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
-
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
@@ -1586,6 +1585,20 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
}
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (lrint VR128:$src)), (VCVTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQrm addr:$src)>;
+ def : Pat<(v8i32 (lrint VR256:$src)), (VCVTPS2DQYrr VR256:$src)>;
+ def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQYrm addr:$src)>;
+ def : Pat<(v4i32 (lrint VR256:$src)), (VCVTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(v4i32 (lrint VR128:$src)), (CVTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (CVTPS2DQrm addr:$src)>;
+}
+
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 46904f82fd5d6d..0be58ca86aa626 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=CHECK,X64-AVX-512
define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
; X64-SSE-LABEL: llrint_v1i64_v1f32:
@@ -9,10 +9,10 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
; X64-SSE-NEXT: cvtss2si %xmm0, %rax
; X64-SSE-NEXT: retq
;
-; X64-AVX-LABEL: llrint_v1i64_v1f32:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT: retq
+; CHECK-LABEL: llrint_v1i64_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvtss2si %xmm0, %rax
+; CHECK-NEXT: retq
%a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
ret <1 x i64> %a
}
@@ -39,6 +39,11 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
; X64-AVX-NEXT: vmovq %rax, %xmm0
; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v2i64_v2f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %xmm0, %xmm0
+; X64-AVX-512-NEXT: retq
%a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
ret <2 x i64> %a
}
@@ -64,6 +69,29 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; X64-SSE-NEXT: movdqa %xmm2, %xmm0
; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: llrint_v4i64_v4f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm0
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v4i64_v4f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %xmm0, %ymm0
+; X64-AVX-512-NEXT: retq
%a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
ret <4 x i64> %a
}
@@ -105,6 +133,45 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
; X64-SSE-NEXT: movdqa %xmm4, %xmm1
; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: llrint_v8i64_v8f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm0
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X64-AVX-NEXT: vmovaps %ymm2, %ymm0
+; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v8i64_v8f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %ymm0, %zmm0
+; X64-AVX-512-NEXT: retq
%a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
ret <8 x i64> %a
}
@@ -183,6 +250,78 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
; X64-SSE-NEXT: movdqa %xmm0, 16(%rdi)
; X64-SSE-NEXT: movdqa %xmm4, (%rdi)
; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: llrint_v16i64_v16f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm2
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm0
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm4, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm4
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X64-AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm4, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm4
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm4
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm5, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm5
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm5, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm5
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm5
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3
+; X64-AVX-NEXT: vmovaps %ymm4, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v16i64_v16f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %ymm0, %zmm2
+; X64-AVX-512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; X64-AVX-512-NEXT: vcvtps2qq %ymm0, %zmm1
+; X64-AVX-512-NEXT: vmovaps %zmm2, %zmm0
+; X64-AVX-512-NEXT: retq
%a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
ret <16 x i64> %a
}
@@ -194,10 +333,10 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
; X64-SSE-NEXT: cvtsd2si %xmm0, %rax
; X64-SSE-NEXT: retq
;
-; X64-AVX-LABEL: llrint_v1i64_v1f64:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT: retq
+; CHECK-LABEL: llrint_v1i64_v1f64:
+; CHECK: ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What I gather is that you've added support for lowering vector [l]lrint for AVX. Further, it seems AVX can only pack vectors with i64. Please mention these in the commit subject and body. Otherwise, I'd like a clarification on the MVT you've matched.
@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, | |||
setOperationAction(ISD::FABS, MVT::v2f64, Custom); | |||
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); | |||
|
|||
setOperationAction(ISD::LRINT, MVT::v4f32, Custom); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this correct? The MVT of ISD::LRINT
is usually set to its output type, not input type. Shouldn't this be MVT::v4i32
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it's correct. ISD::LRINT
uses input type, see https://github.com/llvm/llvm-project/blob/main/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp#L1002-L1006
@@ -1731,6 +1735,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, | |||
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) | |||
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); | |||
} | |||
if (Subtarget.hasDQI() && Subtarget.hasVLX()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not qualified to review this line, as I don't know what DQI
or VLX
are.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's mapped to avx512dq
and avx512vl
features.
Added the relationship in description, thanks! |
@@ -1673,7 +1673,8 @@ namespace llvm { | |||
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; | |||
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; | |||
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; | |||
SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const; | |||
SDValue LowerLRINT_LLRINT(SDValue Op, const X86Subtarget &STI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you need to provide the X86Subtarget arg? Subtarget is available in the X86TargetLowering class
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch! Done.
} | ||
def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do these need to be wrapped in a Predicates = [HasAvx512]
check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we don't bother to check it since VR512 is not available without AVX512F + EVEX512
. We saved a lot of HasEVEX512
check when introducing AVX10.
llvm/lib/Target/X86/X86InstrSSE.td
Outdated
def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>; | ||
} | ||
|
||
let Predicates = [HasSSE2] in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UseSSE2 ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX | ||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX | ||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,X64-AVX | ||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=CHECK,X64-AVX-512 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to use AVX instead of CHECK (which tends to be used for universal checks)?
Should we bother testing on a non-DQ/non-VLX AVX512 target?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX1-i64 | ||
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX512-i64 | ||
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX512-i64 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
worth keeping avx512f only test coverage as well as dq+vl?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
|
||
return DAG.getNode(X86ISD::CVTP2SI, DL, VT, | ||
DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src, | ||
DAG.getUNDEF(SrcVT))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be handled in ReplaceNodeResults?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, the result type is legal.
✅ With the latest revision this PR passed the C/C++ code formatter. |
X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions Followup to llvm#90065
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions Followup to llvm#90065
X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions Followup to #90065
f64/f32
->i32
conversions that can be mapped tollvm.lrint.vNi32.vNf64/32
since SSE2. AVX and AVX512 added 256-bit and 512-bit support;f64/f32
->i64
conversions that can be mapped tollvm.l[l]rint.vNi64.vNf64/32
since AVX512DQ. All 128-bit, 256-bit (require AVX512VL) and 512-bit are supported.