[X86][AVX512DQ] Use packed instructions for scalar FP<->i64 conversions on 32-bit targets

As i64 types are not legal on 32-bit targets, insert these into a suitable zero vector and use the packed vXi64<->FP conversion instructions instead.

Fixes PR3163.

Differential Revision: https://reviews.llvm.org/D43441

llvm-svn: 332498
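
To illustrate the change (a minimal sketch, not taken from this commit's tests; the function names and the llc invocation are hypothetical), scalar i64<->FP conversions like the following now select the packed converters on a 32-bit AVX512DQ target:

; Illustrative only. Built with something like:
;   llc -mtriple=i686-unknown-unknown -mattr=+avx512dq
; the i64 operand is inserted into a zero vector, converted with the packed
; vcvtqq2pd, and the low element extracted, instead of going through a stack
; store and x87 fildll (see the test diff below).
define double @s64_to_f64(i64 %a) {
  %c = sitofp i64 %a to double
  ret double %c
}

; The FP->i64 direction is handled analogously in ReplaceNodeResults.
define i64 @f64_to_s64(double %a) {
  %c = fptosi double %a to i64
  ret i64 %c
}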
topperc committed May 16, 2018
1 parent 5c6b3fb commit 67aa726
Showing 4 changed files with 555 additions and 238 deletions.
70 changes: 62 additions & 8 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16035,6 +16035,34 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   return DAG.getMergeValues(Ops, dl);
 }
 
+// Try to use a packed vector operation to handle i64 on 32-bit targets when
+// AVX512DQ is enabled.
+static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
+          Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
+  SDValue Src = Op.getOperand(0);
+  MVT SrcVT = Src.getSimpleValueType();
+  MVT VT = Op.getSimpleValueType();
+
+  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
+      (VT != MVT::f32 && VT != MVT::f64))
+    return SDValue();
+
+  // Pack the i64 into a vector, do the operation and extract.
+
+  // Using 256-bit to ensure result is 128-bits for f32 case.
+  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
+  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
+  MVT VecVT = MVT::getVectorVT(VT, NumElts);
+
+  SDLoc dl(Op);
+  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
+  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+                     DAG.getIntPtrConstant(0, dl));
+}
+
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
@@ -16056,15 +16084,17 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
 
   // These are really Legal; return the operand so the caller accepts it as
   // Legal.
-  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
     return Op;
-  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
-      Subtarget.is64Bit()) {
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
     return Op;
   }
 
+  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+    return V;
+
   SDValue ValueToStore = Op.getOperand(0);
-  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
       !Subtarget.is64Bit())
     // Bitcasting to f64 here allows us to do a single 64-bit store from
     // an SSE register, avoiding the store forwarding penalty that would come
@@ -16415,6 +16445,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
     return Op;
   }
 
+  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+    return V;
+
   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
@@ -25191,12 +25224,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT: {
     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+    EVT VT = N->getValueType(0);
+    SDValue Src = N->getOperand(0);
+    EVT SrcVT = Src.getValueType();
 
-    if (N->getValueType(0) == MVT::v2i32) {
+    if (VT == MVT::v2i32) {
       assert((IsSigned || Subtarget.hasAVX512()) &&
              "Can only handle signed conversion without AVX512");
       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
-      SDValue Src = N->getOperand(0);
       if (Src.getValueType() == MVT::v2f64) {
         MVT ResVT = MVT::v4i32;
         unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -25217,7 +25252,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         Results.push_back(Res);
         return;
       }
-      if (Src.getValueType() == MVT::v2f32) {
+      if (SrcVT == MVT::v2f32) {
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                  DAG.getUNDEF(MVT::v2f32));
@@ -25234,11 +25269,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         return;
       }
 
+    if (Subtarget.hasDQI() && VT == MVT::i64 &&
+        (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
+      assert(!Subtarget.is64Bit() && "i64 should be legal");
+      unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
+      // Using a 256-bit input here to guarantee 128-bit input for f32 case.
+      // TODO: Use 128-bit vectors for f64 case?
+      // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
+      MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
+      MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
+
+      SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+      SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
+                                DAG.getConstantFP(0.0, dl, VecInVT), Src,
+                                ZeroIdx);
+      Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
+      Results.push_back(Res);
+      return;
+    }
+
     std::pair<SDValue,SDValue> Vals =
         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
     SDValue FIST = Vals.first, StackSlot = Vals.second;
     if (FIST.getNode()) {
-      EVT VT = N->getValueType(0);
       // Return a load from the stack slot.
       if (StackSlot.getNode())
         Results.push_back(
29 changes: 12 additions & 17 deletions llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -1166,30 +1166,25 @@ define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32 x float>
 define x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signext, i32, i64, i16 signext, i32*) #0 {
 ; X32-LABEL: test_argRetMixTypes:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $16, %esp
-; X32-NEXT:    vmovd %edx, %xmm2
-; X32-NEXT:    vpinsrd $1, %edi, %xmm2, %xmm2
-; X32-NEXT:    movl 8(%ebp), %edx
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; X32-NEXT:    vcvtsi2sdl %eax, %xmm3, %xmm1
+; X32-NEXT:    vcvtsi2sdl %eax, %xmm2, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vcvtsi2sdl %ecx, %xmm3, %xmm1
+; X32-NEXT:    vcvtsi2sdl %ecx, %xmm2, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vmovq %xmm2, {{[0-9]+}}(%esp)
-; X32-NEXT:    fildll {{[0-9]+}}(%esp)
-; X32-NEXT:    fstpl (%esp)
-; X32-NEXT:    vaddsd (%esp), %xmm0, %xmm0
+; X32-NEXT:    vmovd %edx, %xmm1
+; X32-NEXT:    vpinsrd $1, %edi, %xmm1, %xmm1
+; X32-NEXT:    vcvtqq2pd %ymm1, %ymm1
+; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vcvtsi2sdl %esi, %xmm3, %xmm1
+; X32-NEXT:    vcvtsi2sdl %esi, %xmm2, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vcvtsi2sdl (%edx), %xmm3, %xmm1
+; X32-NEXT:    vcvtsi2sdl (%ebx), %xmm2, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vcvttsd2si %xmm0, %eax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; WIN64-LABEL: test_argRetMixTypes:
