Skip to content

Commit

Permalink
AMDGPU: Handle unsafe exp.f32 with denormal handling
Browse files Browse the repository at this point in the history
I somehow missed this path when adding the new expansions. Handling it here
saves a lot of instructions for afn + IEEE.

https://reviews.llvm.org/D157867
  • Loading branch information
arsenm committed Aug 14, 2023
1 parent d45022b commit 1faa479
Show file tree
Hide file tree
Showing 3 changed files with 458 additions and 942 deletions.
40 changes: 33 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2760,14 +2760,40 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}

// Builds the DAG for an approximate exp(X) as exp2(X * log2(e)).
//
// NOTE(review): this listing is a diff rendering with the +/- markers
// stripped, so pre-commit and post-commit lines are interleaved. The first
// signature line below (parameter `Op`) looks like the removed version and the
// second (parameter `X`) like its replacement -- confirm against the
// repository before treating this text as compilable code.
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL,
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
SelectionDAG &DAG,
SDNodeFlags Flags) const {
// NOTE(review): the next five lines use `Op`, `K` and an unterminated
// `return`; they are presumably the removed pre-commit body.
// exp2(M_LOG2E_F * f);
EVT VT = Op.getValueType();
const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags);
return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul,
// NOTE(review): the post-commit body presumably starts here.
EVT VT = X.getValueType();
const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);

// Simple path: taken for any non-f32 type, and for f32 when
// needsDenormHandlingF32 reports that no denormal handling is required.
// f32 uses the target AMDGPUISD::EXP node; other types use generic ISD::FEXP2.
if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
// exp2(M_LOG2E_F * f);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT,
Mul, Flags);
}

// f32 path with denormal handling: inputs below the threshold are shifted up
// by a fixed offset before the exp2, and the result is scaled back down
// afterwards.
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

// -0x1.5d58a0p+6 is about -87.34. Presumably this is where exp(X) drops below
// the smallest normal f32 (2^-126) -- TODO confirm. The compare is an ordered
// less-than (SETOLT).
SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);

// Offset of 64.0 added to X when scaling is needed.
SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);

SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);

// Use X + 64 only for inputs below the threshold; otherwise use X unchanged.
SDValue AdjustedX =
DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);

SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);

SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);

// 0x1.969d48p-93 is approximately exp(-64); multiplying by it undoes the +64
// offset applied to the input (exp(X + 64) * exp(-64) == exp(X)).
SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
SDValue AdjustedResult =
DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);

// Pick the rescaled result only where the input was shifted.
return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
Flags);
}

Expand Down Expand Up @@ -2800,7 +2826,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {

// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
// library behavior. Also, is known-not-daz source sufficient?
if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) {
if (allowApproxFunc(DAG, Flags)) {
assert(!IsExp10 && "todo exp10 support");
return lowerFEXPUnsafe(X, SL, DAG, Flags);
}
Expand Down
40 changes: 31 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3304,20 +3304,42 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
}

// Emits generic MIR for an approximate exp as exp2(X * log2(e)), writing the
// result to Dst. Every visible return path returns true.
//
// NOTE(review): this listing is a diff rendering with the +/- markers
// stripped, so pre-commit and post-commit lines are interleaved. The
// `Register Src,` / `unsigned Flags) const {` pair looks like the removed
// parameter list and `Register X, unsigned Flags) const {` like its
// replacement -- confirm against the repository before treating this text as
// compilable code.
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
Register Src,
unsigned Flags) const {
Register X, unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);
// NOTE(review): the `K` / `Mul` lines using `Src` are presumably removed
// pre-commit lines.
auto K = B.buildFConstant(Ty, numbers::log2e);
auto Mul = B.buildFMul(Ty, Src, K, Flags);
LLT F32 = LLT::scalar(32);

// NOTE(review): the next two lines are presumably the removed pre-commit
// `if` header and intrinsic call; the post-commit condition follows them.
if (Ty == LLT::scalar(32)) {
B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
// Simple path: taken for any non-f32 type, and for f32 when
// needsDenormHandlingF32 reports that no denormal handling is required.
if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
auto Log2E = B.buildFConstant(Ty, numbers::log2e);
auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

// f32 uses the amdgcn_exp2 intrinsic; other types use a generic G_FEXP2.
if (Ty == F32) {
B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
.addUse(Mul.getReg(0))
.setMIFlags(Flags);
// NOTE(review): the `} else {` / buildFExp2 pair appears twice below,
// presumably once from the pre-commit text and once from the post-commit
// text.
} else {
B.buildFExp2(Dst, Mul.getReg(0), Flags);
} else {
B.buildFExp2(Dst, Mul.getReg(0), Flags);
}

return true;
}

// f32 path with denormal handling: inputs below the threshold are shifted up
// by a fixed offset before the exp2, and the result is scaled back down
// afterwards.
// -0x1.5d58a0p+6 is about -87.34. Presumably this is where exp(X) drops below
// the smallest normal f32 (2^-126) -- TODO confirm. The compare is an ordered
// less-than (FCMP_OLT) producing a 1-bit scalar.
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
auto NeedsScaling =
B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
// Offset of 64.0 added to X when scaling is needed.
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
// Use X + 64 only for inputs below the threshold; otherwise use X unchanged.
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

auto Log2E = B.buildFConstant(Ty, numbers::log2e);
auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
.addUse(ExpInput.getReg(0))
.setMIFlags(Flags);

// 0x1.969d48p-93 is approximately exp(-64); multiplying by it undoes the +64
// offset applied to the input (exp(X + 64) * exp(-64) == exp(X)).
auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
// Pick the rescaled result only where the input was shifted; write it to Dst.
B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
return true;
}

Expand Down Expand Up @@ -3358,7 +3380,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,

// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
// library behavior. Also, is known-not-daz source sufficient?
if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) {
if (allowApproxFunc(MF, Flags)) {
legalizeFExpUnsafe(B, Dst, X, Flags);
MI.eraseFromParent();
return true;
Expand Down

0 comments on commit 1faa479

Please sign in to comment.