AMDGPU: Always use v_rcp_f16 and v_rsq_f16

These inherited the fast math checks from f32, but the manual suggests these should be accurate enough for unconditional use. The definition of correctly rounded is 0.5ulp, but the manual says "0.51ulp". I've been a bit nervous about changing this as the OpenCL conformance test does not cover half. Brute force produces identical values compared to a reference host implementation for all values.
llvm · Jul 5, 2023 · 9c82dc6 · 9c82dc6
1 parent 59c311c
commit 9c82dc6
Show file tree

Hide file tree

Showing 7 changed files with 324 additions and 624 deletions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -835,17 +835,12 @@ static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
 //
 // NOTE: rcp is the preference in cases that both are legal.
 bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
-
   Type *Ty = FDiv.getType()->getScalarType();
-
-  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
-  // expansion around them in codegen.
-  if (Ty->isDoubleTy())
+  if (!Ty->isFloatTy())
     return false;
 
-  // No intrinsic for fdiv16 if target does not support f16.
-  if (Ty->isHalfTy() && !ST->has16BitInsts())
-    return false;
+  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
+  // expansion around them in codegen. f16 is good enough to always use.
 
   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
   const float ReqdAccuracy =  FPOp->getFPAccuracy();
@@ -854,11 +849,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
   FastMathFlags FMF = FPOp->getFastMathFlags();
   const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
 
-  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+  // rcp_f16 is accurate to 0.51 ulp.
   // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
   // rcp_f64 is never accurate.
-  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
-            (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
+  const bool RcpIsAccurate = !HasFP32Denormals && ReqdAccuracy >= 1.0f;
 
   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
   Builder.setFastMathFlags(FMF);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4246,13 +4246,20 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
   LLT ResTy = MRI.getType(Res);
 
   const MachineFunction &MF = B.getMF();
-  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
-                            MI.getFlag(MachineInstr::FmAfn);
-
-  if (!AllowInaccurateRcp)
-    return false;
+  bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
+                            MF.getTarget().Options.UnsafeFPMath;
 
   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
+    if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
+      return false;
+
+    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+    // the CI documentation has a worst case error of 1 ulp.
+    // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+    // use it as long as we aren't trying to use denormals.
+    //
+    // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
+
     // 1 / x -> RCP(x)
     if (CLHS->isExactlyValue(1.0)) {
       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
@@ -4263,6 +4270,8 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
       return true;
     }
 
+    // TODO: Match rsq
+
     // -1 / x -> RCP( FNEG(x) )
     if (CLHS->isExactlyValue(-1.0)) {
       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
@@ -4275,6 +4284,12 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
     }
   }
 
+  // For f16 require arcp only.
+  // For f32 require afn+arcp.
+  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
+                              !MI.getFlag(MachineInstr::FmArcp)))
+    return false;
+
   // x / y -> x * (1.0 / y)
   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
     .addUse(RHS)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9130,26 +9130,30 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
 
-  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
-
-  // Without !fpmath accuracy information, we can't do more because we don't
-  // know exactly whether rcp is accurate enough to meet !fpmath requirement.
-  if (!AllowInaccurateRcp)
-    return SDValue();
+  bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
+                            DAG.getTarget().Options.UnsafeFPMath;
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+    // Without !fpmath accuracy information, we can't do more because we don't
+    // know exactly whether rcp is accurate enough to meet !fpmath requirement.
+    // f16 is always accurate enough
+    if (!AllowInaccurateRcp && VT != MVT::f16)
+      return SDValue();
+
     if (CLHS->isExactlyValue(1.0)) {
       // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
       // the CI documentation has a worst case error of 1 ulp.
       // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
       // use it as long as we aren't trying to use denormals.
       //
-      // v_rcp_f16 and v_rsq_f16 DO support denormals.
+      // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
 
       // 1.0 / sqrt(x) -> rsq(x)
 
       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
       // error seems really high at 2^29 ULP.
+
+      // XXX - do we need afn for this or is arcp sufficent?
       if (RHS.getOpcode() == ISD::FSQRT)
         return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
 
@@ -9165,6 +9169,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
     }
   }
 
+  // For f16 require arcp only.
+  // For f32 require afn+arcp.
+  if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
+    return SDValue();
+
   // Turn into multiply by the reciprocal.
   // x / y -> x * (1.0 / y)
   SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);