68 changes: 33 additions & 35 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -67,12 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);

static cl::opt<int> ReciprocalEstimateRefinementSteps(
"x86-recip-refinement-steps", cl::init(1),
cl::desc("Specify the number of Newton-Raphson iterations applied to the "
"result of the hardware reciprocal estimate instruction."),
cl::NotHidden);

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
@@ -13006,58 +13000,62 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
bool &UseOneConstNR) const {
// FIXME: We should use instruction latency models to calculate the cost of
// each potential sequence, but this is very hard to do reliably because
// at least Intel's Core* chips have variable timing based on the number of
// significant digits in the divisor and/or sqrt operand.
if (!Subtarget->useSqrtEst())
return SDValue();

EVT VT = Op.getValueType();
const char *RecipOp;

// SSE1 has rsqrtss and rsqrtps.
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant were added to the ISA
// along with FMA, this could be a throughput win.
if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
(Subtarget->hasAVX() && VT == MVT::v8f32)) {
RefinementSteps = 1;
UseOneConstNR = false;
return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
}
return SDValue();
if (VT == MVT::f32 && Subtarget->hasSSE1())
RecipOp = "sqrtf";
else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget->hasAVX()))
RecipOp = "vec-sqrtf";
else
return SDValue();

TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
if (!Recips.isEnabled(RecipOp))
return SDValue();

RefinementSteps = Recips.getRefinementSteps(RecipOp);
UseOneConstNR = false;
return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
}
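For intuition, the refinement that the DAG combiner attaches to the X86ISD::FRSQRT node produced here corresponds to the scalar sequence below. This is a standalone sketch built on the SSE intrinsics, not code from this patch, and RefinedRsqrt is an invented name:

#include <xmmintrin.h>

// One Newton-Raphson step applied to the hardware estimate. RSQRTSS
// guarantees roughly 2^-12 relative error, and each step about doubles
// the number of correct bits, so one step reaches ~24 bits (full float).
static float RefinedRsqrt(float X) {
  float E = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(X))); // ~12-bit estimate
  return E * (1.5f - 0.5f * X * E * E);                 // e' = e * (1.5 - 0.5*x*e*e)
}

The extra refinement steps requested via getRefinementSteps() simply repeat the last line.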

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const {
// FIXME: We should use instruction latency models to calculate the cost of
// each potential sequence, but this is very hard to do reliably because
// at least Intel's Core* chips have variable timing based on the number of
// significant digits in the divisor.
if (!Subtarget->useReciprocalEst())
return SDValue();

EVT VT = Op.getValueType();

const char *RecipOp;

// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant were added to the ISA
// along with FMA, this could be a throughput win.
if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
(Subtarget->hasAVX() && VT == MVT::v8f32)) {
RefinementSteps = ReciprocalEstimateRefinementSteps;
return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
}
return SDValue();
if (VT == MVT::f32 && Subtarget->hasSSE1())
RecipOp = "divf";
else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget->hasAVX()))
RecipOp = "vec-divf";
else
return SDValue();

TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
if (!Recips.isEnabled(RecipOp))
return SDValue();

RefinementSteps = Recips.getRefinementSteps(RecipOp);
return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
}
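The reciprocal case uses the cheaper iteration e' = e * (2 - x*e). Again a standalone sketch with an invented name, not the patch's code:

#include <xmmintrin.h>

// One Newton-Raphson step for 1/x: RCPSS also gives ~12 bits, and
// e' = e * (2 - x*e) doubles that to ~24 bits for a float result.
static float RefinedRecip(float X) {
  float E = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(X)));
  return E * (2.0f - X * E);
}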

/// If we have at least two divisions that use the same divisor, convert to
2 changes: 0 additions & 2 deletions llvm/lib/Target/X86/X86Subtarget.cpp
@@ -274,8 +274,6 @@ void X86Subtarget::initializeEnvironment() {
LEAUsesAG = false;
SlowLEA = false;
SlowIncDec = false;
UseSqrtEst = false;
UseReciprocalEst = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
12 changes: 0 additions & 12 deletions llvm/lib/Target/X86/X86Subtarget.h
@@ -190,16 +190,6 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;

/// Use the RSQRT* instructions to optimize square root calculations.
/// For this to be profitable, the cost of FSQRT and FDIV must be
/// substantially higher than normal FP ops like FADD and FMUL.
bool UseSqrtEst;

/// Use the RCP* instructions to optimize FP division calculations.
/// For this to be profitable, the cost of FDIV must be
/// substantially higher than normal FP ops like FADD and FMUL.
bool UseReciprocalEst;

/// Processor has AVX-512 PreFetch Instructions
bool HasPFI;

@@ -380,8 +370,6 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool useSqrtEst() const { return UseSqrtEst; }
bool useReciprocalEst() const { return UseReciprocalEst; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
7 changes: 7 additions & 0 deletions llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -105,6 +105,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
if (Subtarget.isTargetWin64())
this->Options.TrapUnreachable = true;

// TODO: By default, all reciprocal estimate operations are off, because
// that matches the behavior before TargetRecip was added (except for btver2,
// which used subtarget features to enable this type of codegen).
// We should change this to match GCC's behavior, where everything except
// scalar division estimates is turned on by default with -ffast-math.
this->Options.Reciprocals.setDefaults("all", false, 1);

initAsmInfo();
}

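To make the TODO concrete: a follow-up matching GCC would keep only the scalar division estimate off. The sketch below is hypothetical, not part of this patch, and assumes setDefaults() also accepts the individual operation keys used by the lowering code, not just "all":

  this->Options.Reciprocals.setDefaults("all", false, 1);       // everything off (current behavior)
  this->Options.Reciprocals.setDefaults("vec-divf", true, 1);   // vector 1/x estimate on
  this->Options.Reciprocals.setDefaults("sqrtf", true, 1);      // scalar 1/sqrt(x) estimate on
  this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1);  // vector 1/sqrt(x) estimate on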
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE

; If the target's divss/divps instructions are substantially
; slower than rcpss/rcpps with a Newton-Raphson refinement,
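The RUN lines above show the flag that replaces both the old use-recip-est/use-sqrt-est subtarget attributes and the deleted -x86-recip-refinement-steps option: -recip= takes a comma-separated list of operation keys (divf, vec-divf, sqrtf, vec-sqrtf), and an optional :N suffix on a key sets the Newton-Raphson refinement-step count for that operation. A sketch of typical invocations, with foo.ll standing in for any input file:

  llc foo.ll -mtriple=x86_64-unknown-unknown -mattr=avx -recip=vec-divf          # vector 1/x estimate, default 1 step
  llc foo.ll -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 # scalar and vector 1/x, 2 steps each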
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE

declare double @__sqrt_finite(double) #0
declare float @__sqrtf_finite(float) #0