Skip to content

Commit

Permalink
AMDGPU: Use correct lowering for llvm.log2.f32
Browse files Browse the repository at this point in the history
We previously codegened directly to v_log_f32, which is broken for
denormals. The lowering isn't complicated: you simply need to scale
denormal inputs and adjust the result. Note that log and log10 are still
not accurate enough, and will be fixed separately.
  • Loading branch information
arsenm committed Jun 23, 2023
1 parent 813f6a4 commit 89ccfa1
Show file tree
Hide file tree
Showing 19 changed files with 6,570 additions and 1,551 deletions.
9 changes: 9 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics.

llvm.amdgcn.exp2 Provides direct access to v_exp_f32 and v_exp_f16
(on targets with half support). Performs exp2 function.

:ref:`llvm.log2 <int_log2>` Implemented for float and half (and vectors of float or
half). Not implemented for double. Hardware provides
1ULP accuracy for float, and 0.51ULP for half. The float
instruction does not natively support denormal
inputs, so the backend emits denormal scaling around
it; this scaling is omitted when the call is
marked with the :ref:`afn <fastmath_afn>` flag.


========================================= ==========================================================

.. TODO::
Expand Down
5 changes: 5 additions & 0 deletions llvm/docs/LangRef.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3431,6 +3431,8 @@ floating-point transformations.
to form arbitrary contractions. For example, ``(a*b) + (c*d) + e`` can not
be transformed into ``(a*b) + ((c*d) + e)`` to create two fma operations.

.. _fastmath_afn:

``afn``
Approximate functions - Allow substitution of approximate calculations for
functions (sin, log, sqrt, etc). See floating-point intrinsic definitions
Expand Down Expand Up @@ -14830,6 +14832,9 @@ trapping or setting ``errno``.
When specified with the fast-math-flag 'afn', the result may be approximated
using a less accurate calculation.


.. _int_log2:

'``llvm.log2.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
2 changes: 2 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ Changes to the AMDGPU Backend
* Added llvm.amdgcn.exp2.f32 intrinsic. This provides direct access to
v_exp_f32.

* llvm.log2.f32 is now lowered accurately. Use llvm.amdgcn.log.f32 to
access the old behavior.

Changes to the ARM Backend
--------------------------
Expand Down
86 changes: 82 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,11 +328,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,

// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
ISD::FMAXNUM},
setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
MVT::f32, Legal);

setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
Expand All @@ -345,8 +345,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,

if (Subtarget->has16BitInsts())
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
else
else {
setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
setOperationAction(ISD::FLOG2, MVT::f16, Custom);
}

// FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
// scalarization code. Can be removed when IS_FPCLASS expand isn't called by
Expand Down Expand Up @@ -1304,6 +1306,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
return LowerFROUNDEVEN(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG2:
return LowerFLOG2(Op, DAG);
case ISD::FLOG:
return LowerFLOG(Op, DAG, numbers::ln2);
case ISD::FLOG10:
Expand Down Expand Up @@ -1338,6 +1342,10 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
case ISD::FLOG2:
if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
Results.push_back(Lowered);
return;
default:
return;
}
Expand Down Expand Up @@ -2425,6 +2433,76 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(SDValue Src) {
  switch (Src.getOpcode()) {
  case ISD::FP_EXTEND:
    // An f32 produced by extending f16 is never denormal: the smallest f16
    // denormal lands well inside the f32 normal range.
    return Src.getOperand(0).getValueType() == MVT::f16;
  case ISD::FP16_TO_FP:
    // Same reasoning: the value originates from an f16.
    return true;
  default:
    return false;
  }
  // Note: a trailing llvm_unreachable here would be dead code -- every switch
  // path (including default) returns, matching the GlobalISel twin of this
  // helper which has no such statement.
}

SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // f16 reaches this custom lowering only on subtargets without a native
    // f16 log instruction; promote through f32.
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  // The scaling fixup is only needed when f32 denormal inputs must be
  // honored: skip it under afn / unsafe-math / approx-func, when the source
  // is known to never be an f32 denormal, or when the denormal mode flushes
  // inputs to zero anyway.
  bool NeedDenormHandling =
      !Flags.hasApproximateFuncs() && !DAG.getTarget().Options.UnsafeFPMath &&
      !DAG.getTarget().Options.ApproxFuncFPMath &&
      !valueIsKnownNeverF32Denorm(Src) &&
      DAG.getDenormalMode(VT).Input != DenormalMode::PreserveSign;

  if (!NeedDenormHandling)
    return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);

  const fltSemantics &Semantics = APFloat::IEEEsingle();
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path.
  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  // Multiply possibly-denormal inputs by 2^32 so the hardware log sees a
  // normal value; normal inputs are multiplied by 1.0 (a no-op).
  SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ScaleFactor =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);

  SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);

  SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);

  // Undo the scaling: log2(x * 2^32) == log2(x) + 32, so subtract 32 back off
  // on the scaled path (and 0 on the unscaled path).
  SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue ResultOffset =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
  return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
}

SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
double Log2BaseInverted) const {
EVT VT = Op.getValueType();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
double Log2BaseInverted) const;
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,7 @@ def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src),
def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src),
(AMDGPUfract_impl node:$src)]>;
def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
(AMDGPUlog_impl node:$src),
(flog2 node:$src)]>;
(AMDGPUlog_impl node:$src)]>;
def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
(flog2 node:$src)]>;

Expand Down
90 changes: 89 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1110,7 +1110,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);

// FIXME: fpow has a selection pattern that should move to custom lowering.
auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
auto &Exp2Ops = getActionDefinitionsBuilder(G_FEXP2);
if (ST.has16BitInsts())
Exp2Ops.legalFor({S32, S16});
else
Expand All @@ -1130,6 +1130,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, MinScalarFPTy, S32)
.lower();

auto &Log2Ops = getActionDefinitionsBuilder(G_FLOG2);
Log2Ops.customFor({S32});
if (ST.has16BitInsts())
Log2Ops.legalFor({S16});
else
Log2Ops.customFor({S16});
Log2Ops.scalarize(0)
.lower();

// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
Expand Down Expand Up @@ -1986,6 +1995,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeSignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
case TargetOpcode::G_FLOG2:
return legalizeFlog2(MI, B);
case TargetOpcode::G_FLOG:
return legalizeFlog(MI, B, numbers::ln2);
case TargetOpcode::G_FLOG10:
Expand Down Expand Up @@ -2978,6 +2989,83 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
return true;
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
Register Src) {
Register ExtSrc;
if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
return MRI.getType(ExtSrc) == LLT::scalar(16);
return false;
}

bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  const MachineFunction &MF = B.getMF();

  if (Ty == LLT::scalar(16)) {
    // s16 is custom-legalized only when there is no native f16 log; promote
    // through f32 instead.
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  const fltSemantics &Flt = APFloat::IEEEsingle();

  // The scaling fixup is only needed when f32 denormal inputs must be
  // honored: skip it under afn / unsafe-math / approx-func, when the source
  // is known to never be an f32 denormal, or when the denormal mode flushes
  // inputs to zero anyway.
  bool NeedDenormHandling =
      !MI.getFlag(MachineInstr::FmAfn) &&
      !MF.getTarget().Options.UnsafeFPMath &&
      !MF.getTarget().Options.ApproxFuncFPMath &&
      !valueIsKnownNeverF32Denorm(*B.getMRI(), Src) &&
      MF.getDenormalMode(Flt).Input != DenormalMode::PreserveSign;

  if (!NeedDenormHandling) {
    // Direct hardware log is accurate enough for normal inputs.
    B.buildIntrinsic(Intrinsic::amdgcn_log, ArrayRef<Register>{Dst}, false)
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path (hence "OrZero" in the name).
  auto SmallestNormal =
      B.buildFConstant(Ty, APFloat::getSmallestNormalized(Flt));
  auto IsDenormOrZero =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);

  // Multiply possibly-denormal inputs by 2^32 so the hardware log sees a
  // normal value; normal inputs are multiplied by 1.0 (a no-op).
  auto Scale32 = B.buildFConstant(Ty, 0x1.0p+32);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ScaleFactor = B.buildSelect(Ty, IsDenormOrZero, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(Ty, Src, ScaleFactor, Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
                  .addUse(ScaledInput.getReg(0))
                  .setMIFlags(Flags);

  // Undo the scaling: log2(x * 2^32) == log2(x) + 32, so subtract 32 back off
  // on the scaled path (and 0 on the unscaled path).
  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto ResultOffset = B.buildSelect(Ty, IsDenormOrZero, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFlog(
MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
Register Dst = MI.getOperand(0).getReg();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {

bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B,
double Log2BaseInverted) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5268,6 +5268,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
default:
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
}
}
Expand Down

0 comments on commit 89ccfa1

Please sign in to comment.