Skip to content

Commit

Permalink
AMDGPU: Always use v_rcp_f16 and v_rsq_f16
Browse files Browse the repository at this point in the history
These inherited the fast math checks from f32, but the manual suggests
these should be accurate enough for unconditional use. The definition
of correctly rounded is 0.5ulp, but the manual says "0.51ulp". I've
been a bit nervous about changing this as the OpenCL conformance test
does not cover half. Brute force produces identical values compared to
a reference host implementation for all values.
  • Loading branch information
arsenm committed Jul 5, 2023
1 parent 59c311c commit 9c82dc6
Show file tree
Hide file tree
Showing 7 changed files with 324 additions and 624 deletions.
16 changes: 5 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -835,17 +835,12 @@ static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
//
// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {

Type *Ty = FDiv.getType()->getScalarType();

// The f64 rcp/rsq approximations are pretty inaccurate. We can do an
// expansion around them in codegen.
if (Ty->isDoubleTy())
if (!Ty->isFloatTy())
return false;

// No intrinsic for fdiv16 if target does not support f16.
if (Ty->isHalfTy() && !ST->has16BitInsts())
return false;
// The f64 rcp/rsq approximations are pretty inaccurate. We can do an
// expansion around them in codegen. f16 is good enough to always use.

const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
const float ReqdAccuracy = FPOp->getFPAccuracy();
Expand All @@ -854,11 +849,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
FastMathFlags FMF = FPOp->getFastMathFlags();
const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

// rcp_f16 is accurate for !fpmath >= 1.0ulp.
// rcp_f16 is accurate to 0.51 ulp.
// rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
// rcp_f64 is never accurate.
const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
(Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
const bool RcpIsAccurate = !HasFP32Denormals && ReqdAccuracy >= 1.0f;

IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
Builder.setFastMathFlags(FMF);
Expand Down
25 changes: 20 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4246,13 +4246,20 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
LLT ResTy = MRI.getType(Res);

const MachineFunction &MF = B.getMF();
bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
MI.getFlag(MachineInstr::FmAfn);

if (!AllowInaccurateRcp)
return false;
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
MF.getTarget().Options.UnsafeFPMath;

if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
return false;

// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
// use it as long as we aren't trying to use denormals.
//
// v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

// 1 / x -> RCP(x)
if (CLHS->isExactlyValue(1.0)) {
B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
Expand All @@ -4263,6 +4270,8 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
return true;
}

// TODO: Match rsq

// -1 / x -> RCP( FNEG(x) )
if (CLHS->isExactlyValue(-1.0)) {
auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
Expand All @@ -4275,6 +4284,12 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
}
}

// For f16 require arcp only.
// For f32 require afn+arcp.
if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
!MI.getFlag(MachineInstr::FmArcp)))
return false;

// x / y -> x * (1.0 / y)
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
.addUse(RHS)
Expand Down
23 changes: 16 additions & 7 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9130,26 +9130,30 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();

bool AllowInaccurateRcp = Flags.hasApproximateFuncs();

// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
if (!AllowInaccurateRcp)
return SDValue();
bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
DAG.getTarget().Options.UnsafeFPMath;

if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
// f16 is always accurate enough
if (!AllowInaccurateRcp && VT != MVT::f16)
return SDValue();

if (CLHS->isExactlyValue(1.0)) {
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
// use it as long as we aren't trying to use denormals.
//
// v_rcp_f16 and v_rsq_f16 DO support denormals.
// v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

// 1.0 / sqrt(x) -> rsq(x)

// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
// error seems really high at 2^29 ULP.

// XXX - do we need afn for this or is arcp sufficent?
if (RHS.getOpcode() == ISD::FSQRT)
return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

Expand All @@ -9165,6 +9169,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}

// For f16 require arcp only.
// For f32 require afn+arcp.
if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
return SDValue();

// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Expand Down
Loading

0 comments on commit 9c82dc6

Please sign in to comment.