Skip to content

Commit

Permalink
[X86] Fix fptoui conversions
Browse files Browse the repository at this point in the history
This fixes two issues in x86 fptoui lowering.
1) Makes conversions from f80 go through the correct lowering path on AVX-512.
2) Implements an inline instruction sequence for fptoui to i64 instead of a
library call. This improves performance by roughly 6x on targets with SSE3
or later, and roughly 3x otherwise.
Incidentally, it also removes the use of _ftol2 for fptoui, which was wrong
to begin with: _ftol2 converts to a *signed* i64, so it produced incorrect
results for values >= 2^63.

Patch by: mitch.l.bodart@intel.com
Differential Revision: http://reviews.llvm.org/D11316

llvm-svn: 245924
  • Loading branch information
Michael Kuperstein committed Aug 25, 2015
1 parent 2c8f9c2 commit 8515893
Show file tree
Hide file tree
Showing 5 changed files with 287 additions and 76 deletions.
203 changes: 143 additions & 60 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -114,13 +114,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

// The _ftol2 runtime function has an unusual calling conv, which
// is modeled by a special pseudo-instruction.
setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
}

if (Subtarget->isTargetDarwin()) {
Expand Down Expand Up @@ -228,8 +221,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

if (Subtarget->is64Bit()) {
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
} else {
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
}
} else if (!Subtarget->useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
Expand All @@ -238,14 +237,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// the optimal thing for SSE vs. the default expansion in the legalizer.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
else
// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
// With SSE3 we can use fisttpll to convert to a signed i64; without
// SSE, we're stuck with a fistpll.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
}

if (isTargetFTOL()) {
// Use the _ftol2 runtime function, which has a pseudo-instruction
// to handle its weird calling convention.
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
}

Expand Down Expand Up @@ -1330,13 +1326,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::v8f64, Legal);
setOperationAction(ISD::FMA, MVT::v16f32, Legal);

setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
// FIXME: [US]INT_TO_FP are not legal for f80.
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
}
Expand Down Expand Up @@ -12334,15 +12327,45 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
DAG.getIntPtrConstant(0, dl));
}

// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
// If lowered to the final integer result we return a <result, SDValue()> pair.
// Otherwise we lower it to a sequence ending with a FIST, return a
// <FIST, StackSlot> pair, and the caller is responsible for loading
// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, bool IsReplace) const {
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, bool IsReplace) const {
SDLoc DL(Op);

EVT DstTy = Op.getValueType();
EVT TheVT = Op.getOperand(0).getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());

if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
if (TheVT == MVT::f16)
// We need to promote the f16 to f32 before using the lowering
// in this routine.
return std::make_pair(SDValue(), SDValue());

assert((TheVT == MVT::f32 ||
TheVT == MVT::f64 ||
TheVT == MVT::f80) &&
"Unexpected FP operand type in FP_TO_INTHelper");

// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned &&
DstTy == MVT::i64 &&
(!Subtarget->is64Bit() ||
!isScalarFPTypeInSSEReg(TheVT));

if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
Expand All @@ -12360,27 +12383,72 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());

// We lower FP->int64 either into FISTP64 followed by a load from a temporary
// stack slot, or into the FTOL runtime function.
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

unsigned Opc;
if (!IsSigned && isIntegerTypeFTOL(DstTy))
Opc = X86ISD::WIN_FTOL;
else
switch (DstTy.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
}
switch (DstTy.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
}

SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
EVT TheVT = Op.getOperand(0).getValueType();
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

if (UnsignedFixup) {
//
// Conversion to unsigned i64 is implemented with a select,
// depending on whether the source value fits in the range
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
// Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
// FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
//
// Being a power of 2, Thresh is exactly representable in all FP formats.
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.

APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
&LosesInfo);
else if (TheVT == MVT::f80)
Status = Thresh.convert(APFloat::x87DoubleExtended,
APFloat::rmNearestTiesToEven, &LosesInfo);

assert(Status == APFloat::opOK && !LosesInfo &&
"FP conversion should have been exact");

SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

SDValue Cmp = DAG.getSetCC(DL,
getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
DAG.getConstant(0, DL, MVT::i32),
DAG.getConstant(0x80000000, DL, MVT::i32));
SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
}

// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
Expand All @@ -12406,25 +12474,49 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
MachineMemOperand::MOStore, MemSize, MemSize);

if (Opc != X86ISD::WIN_FTOL) {
if (UnsignedFixup) {

// Insert the FIST, load its result as two i32's,
// and XOR the high i32 with Adjust.

SDValue FistOps[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
FistOps, DstTy, MMO);

SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
MachinePointerInfo(),
false, false, false, 0);
SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot,
DAG.getConstant(4, DL, PtrVT));

SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
MachinePointerInfo(),
false, false, false, 0);
High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

if (Subtarget->is64Bit()) {
// Join High32 and Low32 into a 64-bit result.
// (High32 << 32) | Low32
Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
DAG.getConstant(32, DL, MVT::i8));
SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
return std::make_pair(Result, SDValue());
}

SDValue ResultOps[] = { Low32, High32 };

SDValue pair = IsReplace
? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
: DAG.getMergeValues(ResultOps, DL);
return std::make_pair(pair, SDValue());
} else {
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
Ops, DstTy, MMO);
return std::make_pair(FIST, StackSlot);
} else {
SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
DAG.getVTList(MVT::Other, MVT::Glue),
Chain, Value);
SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
MVT::i32, ftol.getValue(1));
SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
MVT::i32, eax.getValue(2));
SDValue Ops[] = { eax, edx };
SDValue pair = IsReplace
? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
: DAG.getMergeValues(Ops, DL);
return std::make_pair(pair, SDValue());
}
}

Expand Down Expand Up @@ -12688,7 +12780,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
/*IsSigned=*/ true, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
if (!FIST.getNode()) return Op;
if (!FIST.getNode())
return Op;

if (StackSlot.getNode())
// Load the result.
Expand All @@ -12705,7 +12798,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
/*IsSigned=*/ false, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
assert(FIST.getNode() && "Unexpected failure");
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
if (!FIST.getNode())
return Op;

if (StackSlot.getNode())
// Load the result.
Expand Down Expand Up @@ -18885,17 +18980,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FP_TO_SINT:
// FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert
// (FP_TO_SINT (load f16)) to FP_TO_INT*.
if (N->getOperand(0).getValueType() == MVT::f16)
break;
// fallthrough
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
return;

std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
Expand Down Expand Up @@ -26507,10 +26594,6 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}

bool X86TargetLowering::isTargetFTOL() const {
return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
}

bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
Expand Down
9 changes: 0 additions & 9 deletions llvm/lib/Target/X86/X86ISelLowering.h
Expand Up @@ -856,15 +856,6 @@ namespace llvm {
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}

/// Return true if the target uses the MSVC _ftol2 routine for fptoui.
bool isTargetFTOL() const;

/// Return true if the MSVC _ftol2 routine should be used for fptoui to the
/// given type.
bool isIntegerTypeFTOL(EVT VT) const {
return isTargetFTOL() && VT == MVT::i64;
}

/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/pr17631.ll
Expand Up @@ -30,5 +30,5 @@ define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {

; CHECK: foo
; CHECK-NOT: vzeroupper
; CHECK: _ftol2
; CHECK: {{cvtt|fist}}
; CHECK: ret

0 comments on commit 8515893

Please sign in to comment.