diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb43cbe15f522..c149ee289b268 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FABS, MVT::v2f64, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
 
+    setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
+
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1431,6 +1433,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FMINIMUM, VT, Custom);
     }
 
+    setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
+    setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
+
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
@@ -1731,6 +1736,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   }
+  if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
+    for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+      setOperationAction(ISD::LRINT, VT, Legal);
+      setOperationAction(ISD::LLRINT, VT, Legal);
+    }
+  }
 
   // This block controls legalization for 512-bit operations with 8/16/32/64 bit
   // elements. 512-bits can be disabled based on prefer-vector-width and
@@ -1765,6 +1776,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_FMA, VT, Legal);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
     }
+    setOperationAction(ISD::LRINT, MVT::v16f32,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    setOperationAction(ISD::LRINT, MVT::v8f64,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    if (Subtarget.hasDQI())
+      setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
@@ -2488,6 +2505,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::FMAXNUM,
                        ISD::SUB,
                        ISD::LOAD,
+                       ISD::LRINT,
+                       ISD::LLRINT,
                        ISD::MLOAD,
                        ISD::STORE,
                        ISD::MSTORE,
@@ -21161,8 +21180,12 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
+  EVT DstVT = Op.getSimpleValueType();
   MVT SrcVT = Src.getSimpleValueType();
 
+  if (SrcVT.isVector())
+    return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
+
   if (SrcVT == MVT::f16)
     return SDValue();
 
@@ -51556,6 +51579,22 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  SDLoc DL(N);
+
+  if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
+      SrcVT != MVT::v2f32)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+                                 DAG.getUNDEF(SrcVT)));
+}
+
 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
 /// the codegen.
 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
@@ -51902,6 +51941,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
+  // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
+  if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
+      Src.hasOneUse())
+    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
   return SDValue();
 }
 
@@ -56848,6 +56892,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP:
   case ISD::STRICT_UINT_TO_FP:
     return combineUIntToFP(N, DAG, Subtarget);
+  case ISD::LRINT:
+  case ISD::LLRINT:         return combineLRINT_LLRINT(N, DAG, Subtarget);
   case ISD::FADD:
   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   case X86ISD::VFCMULC:
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 43a40f5e691ea..ec2a5f52a7b6a 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8811,7 +8811,18 @@ let Predicates = [HasVLX] in {
   def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
                           v4i32x_info.ImmAllZerosV, VK2WM:$mask),
             (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i32 (lrint VR128X:$src)), (VCVTPS2DQZ128rr VR128X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQZ128rm addr:$src)>;
+  def : Pat<(v8i32 (lrint VR256X:$src)), (VCVTPS2DQZ256rr VR256X:$src)>;
+  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQZ256rm addr:$src)>;
+  def : Pat<(v4i32 (lrint VR256X:$src)), (VCVTPD2DQZ256rr VR256X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQZ256rm addr:$src)>;
 }
+def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;
+def : Pat<(v16i32 (lrint (loadv16f32 addr:$src))), (VCVTPS2DQZrm addr:$src)>;
+def : Pat<(v8i32 (lrint VR512:$src)), (VCVTPD2DQZrr VR512:$src)>;
+def : Pat<(v8i32 (lrint (loadv8f64 addr:$src))), (VCVTPD2DQZrm addr:$src)>;
 
 let Predicates = [HasDQI, HasVLX] in {
   def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
             (VCVTPS2QQZ128rm addr:$src)>;
@@ -8857,6 +8868,30 @@ let Predicates = [HasDQI, HasVLX] in {
                        (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                        v2i64x_info.ImmAllZerosV)),
             (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i64 (lrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (lrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (lrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (lrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (llrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (llrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i64 (lrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (lrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (llrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (llrint (loadv8f32
addr:$src)>; + def : Pat<(v8i64 (lrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>; + def : Pat<(v8i64 (lrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>; + def : Pat<(v8i64 (llrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>; + def : Pat<(v8i64 (llrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>; } let Predicates = [HasVLX] in { diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 063b572761e7d..bc15085f6c7b7 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -1554,7 +1554,6 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, Sched<[WriteCvtPS2ILd]>, SIMD_EXC; - // Convert Packed Double FP to Packed DW Integers let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm @@ -1586,6 +1585,20 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG; } +let Predicates = [HasAVX] in { + def : Pat<(v4i32 (lrint VR128:$src)), (VCVTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQrm addr:$src)>; + def : Pat<(v8i32 (lrint VR256:$src)), (VCVTPS2DQYrr VR256:$src)>; + def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQYrm addr:$src)>; + def : Pat<(v4i32 (lrint VR256:$src)), (VCVTPD2DQYrr VR256:$src)>; + def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4i32 (lrint VR128:$src)), (CVTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (CVTPS2DQrm addr:$src)>; +} + def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll index 46904f82fd5d6..7017eb60df41d 100644 --- a/llvm/test/CodeGen/X86/vector-llrint.ll +++ b/llvm/test/CodeGen/X86/vector-llrint.ll @@ -1,289 +1,674 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64-SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX512DQ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) { -; X64-SSE-LABEL: llrint_v1i64_v1f32: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: cvtss2si %xmm0, %rax -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v1i64_v1f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: retq ; -; X64-AVX-LABEL: llrint_v1i64_v1f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX-NEXT: retq +; AVX-LABEL: llrint_v1i64_v1f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtss2si %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512DQ-LABEL: llrint_v1i64_v1f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtss2si %xmm0, %rax +; AVX512DQ-NEXT: retq %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x) ret <1 x 
i64> %a } declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>) define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) { -; X64-SSE-LABEL: llrint_v2i64_v2f32: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: cvtss2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm1 -; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-SSE-NEXT: cvtss2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v2i64_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: llrint_v2i64_v2f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtss2si %xmm0, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: vcvtss2si %xmm0, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq ; -; X64-AVX-LABEL: llrint_v2i64_v2f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX-NEXT: vmovq %rax, %xmm1 -; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-AVX-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX-NEXT: vmovq %rax, %xmm0 -; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: retq +; AVX512DQ-LABEL: llrint_v2i64_v2f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtps2qq %xmm0, %xmm0 +; AVX512DQ-NEXT: retq %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x) ret <2 x i64> %a } declare <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float>) define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) { -; X64-SSE-LABEL: llrint_v4i64_v4f32: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: cvtss2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm2 -; X64-SSE-NEXT: movaps %xmm0, %xmm1 -; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; X64-SSE-NEXT: cvtss2si %xmm1, %rax -; X64-SSE-NEXT: movq %rax, %xmm1 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-SSE-NEXT: movaps %xmm0, %xmm1 -; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; X64-SSE-NEXT: cvtss2si %xmm1, %rax -; X64-SSE-NEXT: movq %rax, %xmm3 -; X64-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; X64-SSE-NEXT: cvtss2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm1 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v4i64_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE-NEXT: cvtss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-NEXT: cvtss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: llrint_v4i64_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vcvtss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvtss2si %xmm2, %rax +; AVX1-NEXT: 
vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvtss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: llrint_v4i64_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vcvtss2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vcvtss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512DQ-LABEL: llrint_v4i64_v4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtps2qq %xmm0, %ymm0 +; AVX512DQ-NEXT: retq %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x) ret <4 x i64> %a } declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>) define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) { -; X64-SSE-LABEL: llrint_v8i64_v8f32: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps %xmm0, %xmm2 -; X64-SSE-NEXT: cvtss2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: movaps %xmm2, %xmm3 -; X64-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; X64-SSE-NEXT: cvtss2si %xmm3, %rax -; X64-SSE-NEXT: movq %rax, %xmm3 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; X64-SSE-NEXT: movaps %xmm2, %xmm3 -; X64-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; X64-SSE-NEXT: cvtss2si %xmm3, %rax -; X64-SSE-NEXT: movq %rax, %xmm3 -; X64-SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; X64-SSE-NEXT: cvtss2si %xmm2, %rax -; X64-SSE-NEXT: movq %rax, %xmm4 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; X64-SSE-NEXT: cvtss2si %xmm1, %rax -; X64-SSE-NEXT: movq %rax, %xmm2 -; X64-SSE-NEXT: movaps %xmm1, %xmm3 -; X64-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; X64-SSE-NEXT: cvtss2si %xmm3, %rax -; X64-SSE-NEXT: movq %rax, %xmm3 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-SSE-NEXT: movaps %xmm1, %xmm3 -; X64-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] -; X64-SSE-NEXT: cvtss2si %xmm3, %rax -; X64-SSE-NEXT: movq %rax, %xmm5 -; X64-SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; X64-SSE-NEXT: cvtss2si %xmm1, %rax -; X64-SSE-NEXT: movq %rax, %xmm3 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; X64-SSE-NEXT: movdqa %xmm4, %xmm1 -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v8i64_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; SSE-NEXT: cvtss2si %xmm3, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: cvtss2si %xmm3, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: cvtss2si %xmm2, %rax +; SSE-NEXT: movq %rax, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; 
SSE-NEXT: cvtss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSE-NEXT: cvtss2si %xmm3, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] +; SSE-NEXT: cvtss2si %xmm3, %rax +; SSE-NEXT: movq %rax, %xmm5 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvtss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: llrint_v8i64_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vcvtss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvtss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvtss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm3, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vcvtss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vcvtss2si %xmm3, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vcvtss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: llrint_v8i64_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtss2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vcvtss2si %xmm3, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vcvtss2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vcvtss2si %xmm3, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vcvtss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq +; +; AVX512DQ-LABEL: llrint_v8i64_v8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: retq %a = call <8 x i64> 
@llvm.llrint.v8i64.v8f32(<8 x float> %x) ret <8 x i64> %a } declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>) define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { -; X64-SSE-LABEL: llrint_v16i64_v16f32: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movq %rdi, %rax -; X64-SSE-NEXT: cvtss2si %xmm0, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm4 -; X64-SSE-NEXT: movaps %xmm0, %xmm5 -; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; X64-SSE-NEXT: cvtss2si %xmm5, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm5 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; X64-SSE-NEXT: movaps %xmm0, %xmm5 -; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3] -; X64-SSE-NEXT: cvtss2si %xmm5, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm5 -; X64-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; X64-SSE-NEXT: cvtss2si %xmm0, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; X64-SSE-NEXT: cvtss2si %xmm1, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm5 -; X64-SSE-NEXT: movaps %xmm1, %xmm6 -; X64-SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1] -; X64-SSE-NEXT: cvtss2si %xmm6, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm6 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; X64-SSE-NEXT: movaps %xmm1, %xmm6 -; X64-SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3] -; X64-SSE-NEXT: cvtss2si %xmm6, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm6 -; X64-SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; X64-SSE-NEXT: cvtss2si %xmm1, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm1 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; X64-SSE-NEXT: cvtss2si %xmm2, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm6 -; X64-SSE-NEXT: movaps %xmm2, %xmm7 -; X64-SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1] -; X64-SSE-NEXT: cvtss2si %xmm7, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm7 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; X64-SSE-NEXT: movaps %xmm2, %xmm7 -; X64-SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm2[3,3] -; X64-SSE-NEXT: cvtss2si %xmm7, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm7 -; X64-SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; X64-SSE-NEXT: cvtss2si %xmm2, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm2 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; X64-SSE-NEXT: cvtss2si %xmm3, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm7 -; X64-SSE-NEXT: movaps %xmm3, %xmm8 -; X64-SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] -; X64-SSE-NEXT: cvtss2si %xmm8, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm8 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; X64-SSE-NEXT: movaps %xmm3, %xmm8 -; X64-SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3] -; X64-SSE-NEXT: cvtss2si %xmm8, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm8 -; X64-SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] -; X64-SSE-NEXT: cvtss2si %xmm3, %rcx -; X64-SSE-NEXT: movq %rcx, %xmm3 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0] -; X64-SSE-NEXT: movdqa %xmm3, 112(%rdi) -; X64-SSE-NEXT: movdqa %xmm7, 96(%rdi) -; X64-SSE-NEXT: movdqa %xmm2, 80(%rdi) -; X64-SSE-NEXT: movdqa %xmm6, 64(%rdi) -; X64-SSE-NEXT: movdqa %xmm1, 48(%rdi) -; X64-SSE-NEXT: movdqa %xmm5, 32(%rdi) -; X64-SSE-NEXT: movdqa %xmm0, 16(%rdi) -; X64-SSE-NEXT: movdqa %xmm4, (%rdi) -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v16i64_v16f32: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: cvtss2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: cvtss2si %xmm5, %rcx +; SSE-NEXT: movq %rcx, %xmm5 +; SSE-NEXT: punpcklqdq {{.*#+}} 
xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3] +; SSE-NEXT: cvtss2si %xmm5, %rcx +; SSE-NEXT: movq %rcx, %xmm5 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvtss2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: cvtss2si %xmm1, %rcx +; SSE-NEXT: movq %rcx, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1] +; SSE-NEXT: cvtss2si %xmm6, %rcx +; SSE-NEXT: movq %rcx, %xmm6 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3] +; SSE-NEXT: cvtss2si %xmm6, %rcx +; SSE-NEXT: movq %rcx, %xmm6 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvtss2si %xmm1, %rcx +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: cvtss2si %xmm2, %rcx +; SSE-NEXT: movq %rcx, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1] +; SSE-NEXT: cvtss2si %xmm7, %rcx +; SSE-NEXT: movq %rcx, %xmm7 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm2[3,3] +; SSE-NEXT: cvtss2si %xmm7, %rcx +; SSE-NEXT: movq %rcx, %xmm7 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: cvtss2si %xmm2, %rcx +; SSE-NEXT: movq %rcx, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: cvtss2si %xmm3, %rcx +; SSE-NEXT: movq %rcx, %xmm7 +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] +; SSE-NEXT: cvtss2si %xmm8, %rcx +; SSE-NEXT: movq %rcx, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3] +; SSE-NEXT: cvtss2si %xmm8, %rcx +; SSE-NEXT: movq %rcx, %xmm8 +; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: cvtss2si %xmm3, %rcx +; SSE-NEXT: movq %rcx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0] +; SSE-NEXT: movdqa %xmm3, 112(%rdi) +; SSE-NEXT: movdqa %xmm7, 96(%rdi) +; SSE-NEXT: movdqa %xmm2, 80(%rdi) +; SSE-NEXT: movdqa %xmm6, 64(%rdi) +; SSE-NEXT: movdqa %xmm1, 48(%rdi) +; SSE-NEXT: movdqa %xmm5, 32(%rdi) +; SSE-NEXT: movdqa %xmm0, 16(%rdi) +; SSE-NEXT: movdqa %xmm4, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: llrint_v16i64_v16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %ymm0, %ymm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX1-NEXT: vcvtss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX1-NEXT: vcvtss2si %xmm3, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vcvtss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm4, %rax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3] +; AVX1-NEXT: vcvtss2si %xmm3, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX1-NEXT: vcvtss2si %xmm4, %rax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-NEXT: vcvtss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vmovshdup {{.*#+}} 
xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX1-NEXT: vcvtss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-NEXT: vcvtss2si %xmm3, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vcvtss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm5, %rax +; AVX1-NEXT: vmovq %rax, %xmm5 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX1-NEXT: vcvtss2si %xmm3, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX1-NEXT: vcvtss2si %xmm5, %rax +; AVX1-NEXT: vmovq %rax, %xmm5 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0] +; AVX1-NEXT: vcvtss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm5 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 +; AVX1-NEXT: vmovaps %ymm4, %ymm1 +; AVX1-NEXT: retq +; +; AVX512-LABEL: llrint_v16i64_v16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtss2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vcvtss2si %xmm3, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vcvtss2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vcvtss2si %xmm3, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vcvtss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtss2si %xmm4, %rax +; AVX512-NEXT: vmovq %rax, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtss2si %xmm3, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX512-NEXT: vcvtss2si %xmm4, %rax +; AVX512-NEXT: vmovq %rax, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm4 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; 
AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512-NEXT: vcvtss2si %xmm3, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vcvtss2si %xmm4, %rax +; AVX512-NEXT: vmovq %rax, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512-NEXT: vcvtss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm4 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: retq +; +; AVX512DQ-LABEL: llrint_v16i64_v16f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtps2qq %ymm0, %zmm2 +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vcvtps2qq %ymm0, %zmm1 +; AVX512DQ-NEXT: vmovaps %zmm2, %zmm0 +; AVX512DQ-NEXT: retq %a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x) ret <16 x i64> %a } declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>) define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) { -; X64-SSE-LABEL: llrint_v1i64_v1f64: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: cvtsd2si %xmm0, %rax -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v1i64_v1f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: llrint_v1i64_v1f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsd2si %xmm0, %rax +; AVX-NEXT: retq ; -; X64-AVX-LABEL: llrint_v1i64_v1f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX-NEXT: retq +; AVX512DQ-LABEL: llrint_v1i64_v1f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtsd2si %xmm0, %rax +; AVX512DQ-NEXT: retq %a = call <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double> %x) ret <1 x i64> %a } declare <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double>) define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) { -; X64-SSE-LABEL: llrint_v2i64_v2f64: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: cvtsd2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm1 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; X64-SSE-NEXT: cvtsd2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v2i64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: llrint_v2i64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsd2si %xmm0, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vcvtsd2si %xmm0, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq ; -; X64-AVX-LABEL: llrint_v2i64_v2f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX-NEXT: vmovq %rax, %xmm1 -; X64-AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX-NEXT: vmovq %rax, %xmm0 -; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: retq +; AVX512DQ-LABEL: llrint_v2i64_v2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtpd2qq %xmm0, %xmm0 +; AVX512DQ-NEXT: retq %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> 
%x) ret <2 x i64> %a } declare <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double>) define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) { -; X64-SSE-LABEL: llrint_v4i64_v4f64: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: cvtsd2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm2 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; X64-SSE-NEXT: cvtsd2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; X64-SSE-NEXT: cvtsd2si %xmm1, %rax -; X64-SSE-NEXT: movq %rax, %xmm3 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; X64-SSE-NEXT: cvtsd2si %xmm1, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v4i64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: cvtsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvtsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: llrint_v4i64_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vcvtsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vcvtsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvtsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vcvtsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: llrint_v4i64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512DQ-LABEL: llrint_v4i64_v4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtpd2qq %ymm0, %ymm0 +; AVX512DQ-NEXT: retq %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x) ret <4 x i64> %a } declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>) define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { -; X64-SSE-LABEL: llrint_v8i64_v8f64: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: cvtsd2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm4 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; X64-SSE-NEXT: cvtsd2si %xmm0, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; X64-SSE-NEXT: cvtsd2si %xmm1, %rax -; X64-SSE-NEXT: movq %rax, %xmm5 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; X64-SSE-NEXT: cvtsd2si %xmm1, %rax -; 
X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; X64-SSE-NEXT: cvtsd2si %xmm2, %rax -; X64-SSE-NEXT: movq %rax, %xmm6 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; X64-SSE-NEXT: cvtsd2si %xmm2, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; X64-SSE-NEXT: cvtsd2si %xmm3, %rax -; X64-SSE-NEXT: movq %rax, %xmm7 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; X64-SSE-NEXT: cvtsd2si %xmm3, %rax -; X64-SSE-NEXT: movq %rax, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; X64-SSE-NEXT: movdqa %xmm4, %xmm0 -; X64-SSE-NEXT: movdqa %xmm5, %xmm1 -; X64-SSE-NEXT: movdqa %xmm6, %xmm2 -; X64-SSE-NEXT: movdqa %xmm7, %xmm3 -; X64-SSE-NEXT: retq +; SSE-LABEL: llrint_v8i64_v8f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: cvtsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvtsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: cvtsd2si %xmm2, %rax +; SSE-NEXT: movq %rax, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: cvtsd2si %xmm2, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: cvtsd2si %xmm3, %rax +; SSE-NEXT: movq %rax, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: cvtsd2si %xmm3, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: llrint_v8i64_v8f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vcvtsd2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-NEXT: vcvtsd2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vcvtsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vcvtsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vcvtsd2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-NEXT: vcvtsd2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vcvtsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vcvtsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX512-LABEL: llrint_v8i64_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; 
AVX512-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq +; +; AVX512DQ-LABEL: llrint_v8i64_v8f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: retq %a = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> %x) ret <8 x i64> %a } diff --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll index f527a3584f447..3612205bf1bfa 100644 --- a/llvm/test/CodeGen/X86/vector-lrint.ll +++ b/llvm/test/CodeGen/X86/vector-lrint.ll @@ -1,11 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefix=X86-SSE2 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1 -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86-AVX,X86-AVX512 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86-AVX,AVX512-i32 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X86-AVX,AVX512-i32 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i32,X64-AVX1-i32 -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i32,X64-AVX512-i32 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i32,AVX512-i32 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i32,AVX512-i32 ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX1-i64 -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX512-i64 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512-i64 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512DQ-i64 define <1 x iXLen> @lrint_v1f32(<1 x float> %x) { ; X86-SSE2-LABEL: lrint_v1f32: @@ -35,64 +38,43 @@ declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float>) define <2 x iXLen> @lrint_v2f32(<2 x float> %x) { ; X86-SSE2-LABEL: lrint_v2f32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movaps %xmm0, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] 
-; X86-SSE2-NEXT: cvtss2si %xmm1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm1 -; X86-SSE2-NEXT: movaps %xmm0, %xmm2 -; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; X86-SSE2-NEXT: cvtss2si %xmm2, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: cvtss2si %xmm0, %eax -; X86-SSE2-NEXT: movd %eax, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: cvtss2si %xmm0, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: cvtps2dq %xmm0, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: lrint_v2f32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-AVX-NEXT: vcvtss2si %xmm1, %eax -; X86-AVX-NEXT: vcvtss2si %xmm0, %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X86-AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X86-AVX-NEXT: vcvtss2si %xmm2, %eax -; X86-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X86-AVX-NEXT: vcvtss2si %xmm0, %eax -; X86-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; X86-AVX-NEXT: vcvtps2dq %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-i32-LABEL: lrint_v2f32: ; X64-AVX-i32: # %bb.0: -; X64-AVX-i32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-AVX-i32-NEXT: vcvtss2si %xmm1, %eax -; X64-AVX-i32-NEXT: vcvtss2si %xmm0, %ecx -; X64-AVX-i32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-i32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X64-AVX-i32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X64-AVX-i32-NEXT: vcvtss2si %xmm2, %eax -; X64-AVX-i32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X64-AVX-i32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X64-AVX-i32-NEXT: vcvtss2si %xmm0, %eax -; X64-AVX-i32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; X64-AVX-i32-NEXT: vcvtps2dq %xmm0, %xmm0 ; X64-AVX-i32-NEXT: retq ; -; X64-AVX-i64-LABEL: lrint_v2f32: -; X64-AVX-i64: # %bb.0: -; X64-AVX-i64-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX-i64-NEXT: vmovq %rax, %xmm1 -; X64-AVX-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-AVX-i64-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX-i64-NEXT: retq +; X64-AVX1-i64-LABEL: lrint_v2f32: +; X64-AVX1-i64: # %bb.0: +; X64-AVX1-i64-NEXT: vcvtss2si %xmm0, %rax +; X64-AVX1-i64-NEXT: vmovq %rax, %xmm1 +; X64-AVX1-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX1-i64-NEXT: vcvtss2si %xmm0, %rax +; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0 +; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX1-i64-NEXT: retq +; +; AVX512-i64-LABEL: lrint_v2f32: +; AVX512-i64: # %bb.0: +; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm1 +; AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm0 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-i64-NEXT: retq +; +; AVX512DQ-i64-LABEL: lrint_v2f32: +; AVX512DQ-i64: # %bb.0: +; AVX512DQ-i64-NEXT: vcvtps2qq %xmm0, %xmm0 +; AVX512DQ-i64-NEXT: retq %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x) ret <2 x iXLen> %a } @@ -101,53 +83,17 @@ declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>) define <4 x iXLen> @lrint_v4f32(<4 x float> %x) { ; X86-SSE2-LABEL: 
lrint_v4f32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movaps %xmm0, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; X86-SSE2-NEXT: cvtss2si %xmm1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm1 -; X86-SSE2-NEXT: movaps %xmm0, %xmm2 -; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; X86-SSE2-NEXT: cvtss2si %xmm2, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: cvtss2si %xmm0, %eax -; X86-SSE2-NEXT: movd %eax, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: cvtss2si %xmm0, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: cvtps2dq %xmm0, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: lrint_v4f32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-AVX-NEXT: vcvtss2si %xmm1, %eax -; X86-AVX-NEXT: vcvtss2si %xmm0, %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X86-AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X86-AVX-NEXT: vcvtss2si %xmm2, %eax -; X86-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X86-AVX-NEXT: vcvtss2si %xmm0, %eax -; X86-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; X86-AVX-NEXT: vcvtps2dq %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-i32-LABEL: lrint_v4f32: ; X64-AVX-i32: # %bb.0: -; X64-AVX-i32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-AVX-i32-NEXT: vcvtss2si %xmm1, %eax -; X64-AVX-i32-NEXT: vcvtss2si %xmm0, %ecx -; X64-AVX-i32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-i32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X64-AVX-i32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X64-AVX-i32-NEXT: vcvtss2si %xmm2, %eax -; X64-AVX-i32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X64-AVX-i32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X64-AVX-i32-NEXT: vcvtss2si %xmm0, %eax -; X64-AVX-i32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; X64-AVX-i32-NEXT: vcvtps2dq %xmm0, %xmm0 ; X64-AVX-i32-NEXT: retq ; ; X64-AVX1-i64-LABEL: lrint_v4f32: @@ -168,23 +114,28 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) { ; X64-AVX1-i64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-AVX1-i64-NEXT: retq ; -; X64-AVX512-i64-LABEL: lrint_v4f32: -; X64-AVX512-i64: # %bb.0: -; X64-AVX512-i64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm1, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm1 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm2, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; X64-AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX512-i64-NEXT: retq +; AVX512-i64-LABEL: lrint_v4f32: +; AVX512-i64: # %bb.0: +; AVX512-i64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-i64-NEXT: vcvtss2si %xmm1, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm1 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-i64-NEXT: vcvtss2si %xmm2, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: 
vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm0 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-i64-NEXT: retq +; +; AVX512DQ-i64-LABEL: lrint_v4f32: +; AVX512DQ-i64: # %bb.0: +; AVX512DQ-i64-NEXT: vcvtps2qq %xmm0, %ymm0 +; AVX512DQ-i64-NEXT: retq %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x) ret <4 x iXLen> %a } @@ -193,152 +144,19 @@ declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>) define <8 x iXLen> @lrint_v8f32(<8 x float> %x) { ; X86-SSE2-LABEL: lrint_v8f32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movaps %xmm0, %xmm2 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X86-SSE2-NEXT: cvtss2si %xmm0, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: movaps %xmm2, %xmm3 -; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; X86-SSE2-NEXT: cvtss2si %xmm3, %eax -; X86-SSE2-NEXT: movd %eax, %xmm3 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; X86-SSE2-NEXT: cvtss2si %xmm2, %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: cvtss2si %xmm2, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; X86-SSE2-NEXT: movaps %xmm1, %xmm2 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; X86-SSE2-NEXT: cvtss2si %xmm2, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 -; X86-SSE2-NEXT: movaps %xmm1, %xmm3 -; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; X86-SSE2-NEXT: cvtss2si %xmm3, %eax -; X86-SSE2-NEXT: movd %eax, %xmm3 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-SSE2-NEXT: cvtss2si %xmm1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: cvtss2si %xmm1, %eax -; X86-SSE2-NEXT: movd %eax, %xmm1 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: cvtps2dq %xmm0, %xmm0 +; X86-SSE2-NEXT: cvtps2dq %xmm1, %xmm1 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: lrint_v8f32: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X86-AVX1-NEXT: vcvtss2si %xmm2, %eax -; X86-AVX1-NEXT: vcvtss2si %xmm1, %ecx -; X86-AVX1-NEXT: vmovd %ecx, %xmm2 -; X86-AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X86-AVX1-NEXT: vcvtss2si %xmm3, %eax -; X86-AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X86-AVX1-NEXT: vcvtss2si %xmm1, %eax -; X86-AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; X86-AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-AVX1-NEXT: vcvtss2si %xmm2, %eax -; X86-AVX1-NEXT: vcvtss2si %xmm0, %ecx -; X86-AVX1-NEXT: vmovd %ecx, %xmm2 -; X86-AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; X86-AVX1-NEXT: vcvtss2si %xmm3, %eax -; X86-AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X86-AVX1-NEXT: vcvtss2si %xmm0, %eax -; 
X86-AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX512-LABEL: lrint_v8f32: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X86-AVX512-NEXT: vcvtss2si %xmm2, %eax -; X86-AVX512-NEXT: vcvtss2si %xmm1, %ecx -; X86-AVX512-NEXT: vmovd %ecx, %xmm2 -; X86-AVX512-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X86-AVX512-NEXT: vcvtss2si %xmm3, %eax -; X86-AVX512-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X86-AVX512-NEXT: vcvtss2si %xmm1, %eax -; X86-AVX512-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; X86-AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-AVX512-NEXT: vcvtss2si %xmm2, %eax -; X86-AVX512-NEXT: vcvtss2si %xmm0, %ecx -; X86-AVX512-NEXT: vmovd %ecx, %xmm2 -; X86-AVX512-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; X86-AVX512-NEXT: vcvtss2si %xmm3, %eax -; X86-AVX512-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X86-AVX512-NEXT: vcvtss2si %xmm0, %eax -; X86-AVX512-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; X86-AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX512-NEXT: retl -; -; X64-AVX1-i32-LABEL: lrint_v8f32: -; X64-AVX1-i32: # %bb.0: -; X64-AVX1-i32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX1-i32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X64-AVX1-i32-NEXT: vcvtss2si %xmm2, %eax -; X64-AVX1-i32-NEXT: vcvtss2si %xmm1, %ecx -; X64-AVX1-i32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX1-i32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X64-AVX1-i32-NEXT: vcvtss2si %xmm3, %eax -; X64-AVX1-i32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X64-AVX1-i32-NEXT: vcvtss2si %xmm1, %eax -; X64-AVX1-i32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; X64-AVX1-i32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-AVX1-i32-NEXT: vcvtss2si %xmm2, %eax -; X64-AVX1-i32-NEXT: vcvtss2si %xmm0, %ecx -; X64-AVX1-i32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX1-i32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; X64-AVX1-i32-NEXT: vcvtss2si %xmm3, %eax -; X64-AVX1-i32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X64-AVX1-i32-NEXT: vcvtss2si %xmm0, %eax -; X64-AVX1-i32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; X64-AVX1-i32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-i32-NEXT: retq +; X86-AVX-LABEL: lrint_v8f32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vcvtps2dq %ymm0, %ymm0 +; X86-AVX-NEXT: retl ; -; X64-AVX512-i32-LABEL: lrint_v8f32: -; X64-AVX512-i32: # %bb.0: -; X64-AVX512-i32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX512-i32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X64-AVX512-i32-NEXT: vcvtss2si %xmm2, %eax -; X64-AVX512-i32-NEXT: vcvtss2si %xmm1, %ecx -; X64-AVX512-i32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX512-i32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX512-i32-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X64-AVX512-i32-NEXT: vcvtss2si %xmm3, %eax -; X64-AVX512-i32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX512-i32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X64-AVX512-i32-NEXT: vcvtss2si %xmm1, %eax -; X64-AVX512-i32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; X64-AVX512-i32-NEXT: vmovshdup {{.*#+}} xmm2 = 
xmm0[1,1,3,3] -; X64-AVX512-i32-NEXT: vcvtss2si %xmm2, %eax -; X64-AVX512-i32-NEXT: vcvtss2si %xmm0, %ecx -; X64-AVX512-i32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX512-i32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX512-i32-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; X64-AVX512-i32-NEXT: vcvtss2si %xmm3, %eax -; X64-AVX512-i32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX512-i32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X64-AVX512-i32-NEXT: vcvtss2si %xmm0, %eax -; X64-AVX512-i32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; X64-AVX512-i32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX512-i32-NEXT: retq +; X64-AVX-i32-LABEL: lrint_v8f32: +; X64-AVX-i32: # %bb.0: +; X64-AVX-i32-NEXT: vcvtps2dq %ymm0, %ymm0 +; X64-AVX-i32-NEXT: retq ; ; X64-AVX1-i64-LABEL: lrint_v8f32: ; X64-AVX1-i64: # %bb.0: @@ -374,39 +192,44 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) { ; X64-AVX1-i64-NEXT: vmovaps %ymm2, %ymm0 ; X64-AVX1-i64-NEXT: retq ; -; X64-AVX512-i64-LABEL: lrint_v8f32: -; X64-AVX512-i64: # %bb.0: -; X64-AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX512-i64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm2, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm3, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm3 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm1, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm3 -; X64-AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm1, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm1 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; X64-AVX512-i64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; X64-AVX512-i64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm2, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm3, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm3 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm3 -; X64-AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-AVX512-i64-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; X64-AVX512-i64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X64-AVX512-i64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; X64-AVX512-i64-NEXT: retq +; AVX512-i64-LABEL: lrint_v8f32: +; AVX512-i64: # %bb.0: +; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-i64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-i64-NEXT: vcvtss2si %xmm2, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-i64-NEXT: vcvtss2si %xmm3, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm3 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-i64-NEXT: vcvtss2si %xmm1, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm3 +; AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX512-i64-NEXT: vcvtss2si %xmm1, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm1 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512-i64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-i64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-i64-NEXT: vcvtss2si %xmm2, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; 
AVX512-i64-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-i64-NEXT: vcvtss2si %xmm3, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm3 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm3 +; AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm0 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-i64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-i64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-i64-NEXT: retq +; +; AVX512DQ-i64-LABEL: lrint_v8f32: +; AVX512DQ-i64: # %bb.0: +; AVX512DQ-i64-NEXT: vcvtps2qq %ymm0, %zmm0 +; AVX512DQ-i64-NEXT: retq %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float> %x) ret <8 x iXLen> %a } @@ -473,15 +296,30 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) { ; X64-AVX-i32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; X64-AVX-i32-NEXT: retq ; -; X64-AVX-i64-LABEL: lrint_v2f64: -; X64-AVX-i64: # %bb.0: -; X64-AVX-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX-i64-NEXT: vmovq %rax, %xmm1 -; X64-AVX-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX-i64-NEXT: retq +; X64-AVX1-i64-LABEL: lrint_v2f64: +; X64-AVX1-i64: # %bb.0: +; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rax +; X64-AVX1-i64-NEXT: vmovq %rax, %xmm1 +; X64-AVX1-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rax +; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0 +; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX1-i64-NEXT: retq +; +; AVX512-i64-LABEL: lrint_v2f64: +; AVX512-i64: # %bb.0: +; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm1 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm0 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-i64-NEXT: retq +; +; AVX512DQ-i64-LABEL: lrint_v2f64: +; AVX512DQ-i64: # %bb.0: +; AVX512DQ-i64-NEXT: vcvtpd2qq %xmm0, %xmm0 +; AVX512DQ-i64-NEXT: retq %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x) ret <2 x iXLen> %a } @@ -508,33 +346,13 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) { ; ; X86-AVX-LABEL: lrint_v4f64: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-AVX-NEXT: vcvtsd2si %xmm1, %eax -; X86-AVX-NEXT: vcvtsd2si %xmm0, %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X86-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX-NEXT: vcvtsd2si %xmm0, %eax -; X86-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X86-AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X86-AVX-NEXT: vcvtsd2si %xmm0, %eax -; X86-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; X86-AVX-NEXT: vcvtpd2dq %ymm0, %xmm0 ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl ; ; X64-AVX-i32-LABEL: lrint_v4f64: ; X64-AVX-i32: # %bb.0: -; X64-AVX-i32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-AVX-i32-NEXT: vcvtsd2si %xmm1, %eax -; X64-AVX-i32-NEXT: vcvtsd2si %xmm0, %ecx -; X64-AVX-i32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-i32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X64-AVX-i32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-AVX-i32-NEXT: vcvtsd2si %xmm0, %eax -; X64-AVX-i32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X64-AVX-i32-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX-i32-NEXT: vcvtsd2si %xmm0, 
%eax -; X64-AVX-i32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; X64-AVX-i32-NEXT: vcvtpd2dq %ymm0, %xmm0 ; X64-AVX-i32-NEXT: vzeroupper ; X64-AVX-i32-NEXT: retq ; @@ -556,23 +374,28 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) { ; X64-AVX1-i64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-AVX1-i64-NEXT: retq ; -; X64-AVX512-i64-LABEL: lrint_v4f64: -; X64-AVX512-i64: # %bb.0: -; X64-AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm1 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; X64-AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX512-i64-NEXT: retq +; AVX512-i64-LABEL: lrint_v4f64: +; AVX512-i64: # %bb.0: +; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm1 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm0 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-i64-NEXT: retq +; +; AVX512DQ-i64-LABEL: lrint_v4f64: +; AVX512DQ-i64: # %bb.0: +; AVX512DQ-i64-NEXT: vcvtpd2qq %ymm0, %ymm0 +; AVX512DQ-i64-NEXT: retq %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x) ret <4 x iXLen> %a } @@ -623,114 +446,23 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; ; X86-AVX1-LABEL: lrint_v8f64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; X86-AVX1-NEXT: vcvtsd2si %xmm2, %eax -; X86-AVX1-NEXT: vcvtsd2si %xmm1, %ecx -; X86-AVX1-NEXT: vmovd %ecx, %xmm2 -; X86-AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vcvtsd2si %xmm1, %eax -; X86-AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; X86-AVX1-NEXT: vcvtsd2si %xmm1, %eax -; X86-AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; X86-AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X86-AVX1-NEXT: vcvtsd2si %xmm2, %eax -; X86-AVX1-NEXT: vcvtsd2si %xmm0, %ecx -; X86-AVX1-NEXT: vmovd %ecx, %xmm2 -; X86-AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vcvtsd2si %xmm0, %eax -; X86-AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X86-AVX1-NEXT: vcvtsd2si %xmm0, %eax -; X86-AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; X86-AVX1-NEXT: vcvtpd2dq %ymm0, %xmm0 +; X86-AVX1-NEXT: vcvtpd2dq %ymm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; -; X86-AVX512-LABEL: lrint_v8f64: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; X86-AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; 
X86-AVX512-NEXT: vcvtsd2si %xmm2, %eax -; X86-AVX512-NEXT: vcvtsd2si %xmm1, %ecx -; X86-AVX512-NEXT: vmovd %ecx, %xmm1 -; X86-AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; X86-AVX512-NEXT: vcvtsd2si %xmm2, %eax -; X86-AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X86-AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; X86-AVX512-NEXT: vcvtsd2si %xmm2, %eax -; X86-AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X86-AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X86-AVX512-NEXT: vcvtsd2si %xmm2, %eax -; X86-AVX512-NEXT: vcvtsd2si %xmm0, %ecx -; X86-AVX512-NEXT: vmovd %ecx, %xmm2 -; X86-AVX512-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX512-NEXT: vcvtsd2si %xmm0, %eax -; X86-AVX512-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X86-AVX512-NEXT: vcvtsd2si %xmm0, %eax -; X86-AVX512-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; X86-AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX512-NEXT: retl +; AVX512-i32-LABEL: lrint_v8f64: +; AVX512-i32: # %bb.0: +; AVX512-i32-NEXT: vcvtpd2dq %zmm0, %ymm0 +; AVX512-i32-NEXT: ret{{[l|q]}} ; ; X64-AVX1-i32-LABEL: lrint_v8f64: ; X64-AVX1-i32: # %bb.0: -; X64-AVX1-i32-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm2, %eax -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm1, %ecx -; X64-AVX1-i32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX1-i32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm1, %eax -; X64-AVX1-i32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm1, %eax -; X64-AVX1-i32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; X64-AVX1-i32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm2, %eax -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm0, %ecx -; X64-AVX1-i32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX1-i32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm0, %eax -; X64-AVX1-i32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX1-i32-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX1-i32-NEXT: vcvtsd2si %xmm0, %eax -; X64-AVX1-i32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; X64-AVX1-i32-NEXT: vcvtpd2dq %ymm0, %xmm0 +; X64-AVX1-i32-NEXT: vcvtpd2dq %ymm1, %xmm1 ; X64-AVX1-i32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-AVX1-i32-NEXT: retq ; -; X64-AVX512-i32-LABEL: lrint_v8f64: -; X64-AVX512-i32: # %bb.0: -; X64-AVX512-i32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; X64-AVX512-i32-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm2, %eax -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm1, %ecx -; X64-AVX512-i32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX512-i32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X64-AVX512-i32-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm2, %eax -; X64-AVX512-i32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X64-AVX512-i32-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm2, %eax -; X64-AVX512-i32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X64-AVX512-i32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm2, %eax -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm0, %ecx -; X64-AVX512-i32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX512-i32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX512-i32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm0, %eax -; 
X64-AVX512-i32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX512-i32-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX512-i32-NEXT: vcvtsd2si %xmm0, %eax -; X64-AVX512-i32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; X64-AVX512-i32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX512-i32-NEXT: retq -; ; X64-AVX1-i64-LABEL: lrint_v8f64: ; X64-AVX1-i64: # %bb.0: ; X64-AVX1-i64-NEXT: vextractf128 $1, %ymm0, %xmm2 @@ -763,39 +495,44 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; X64-AVX1-i64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; X64-AVX1-i64-NEXT: retq ; -; X64-AVX512-i64-LABEL: lrint_v8f64: -; X64-AVX512-i64: # %bb.0: -; X64-AVX512-i64-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm1 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; X64-AVX512-i64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm3 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; X64-AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; X64-AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm3 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm3 -; X64-AVX512-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX512-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; X64-AVX512-i64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X64-AVX512-i64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; X64-AVX512-i64-NEXT: retq +; AVX512-i64-LABEL: lrint_v8f64: +; AVX512-i64: # %bb.0: +; AVX512-i64-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm1 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-i64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm3 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm3 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm3 +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-i64-NEXT: vmovq %rax, %xmm0 +; AVX512-i64-NEXT: 
vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-i64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-i64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-i64-NEXT: retq +; +; AVX512DQ-i64-LABEL: lrint_v8f64: +; AVX512DQ-i64: # %bb.0: +; AVX512DQ-i64-NEXT: vcvtpd2qq %zmm0, %zmm0 +; AVX512DQ-i64-NEXT: retq %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double> %x) ret <8 x iXLen> %a }