Skip to content

Commit

Permalink
AMDGPU: Use correct lowering for llvm.log2.f32
Browse files Browse the repository at this point in the history
We previously codegened directly to v_log_f32, which is broken for
denormals. The lowering isn't complicated: you simply need to scale
denormal inputs and adjust the result. Note that log and log10 are still
not accurate enough, and will be fixed separately.
  • Loading branch information
arsenm committed Jun 23, 2023
1 parent 813f6a4 commit 89ccfa1
Show file tree
Hide file tree
Showing 19 changed files with 6,570 additions and 1,551 deletions.
9 changes: 9 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics.

llvm.amdgcn.exp2 Provides direct access to v_exp_f32 and v_exp_f16
(on targets with half support). Performs exp2 function.

:ref:`llvm.log2 <int_log2>` Implemented for float and half (and vectors of float or
half). Not implemented for double. Hardware provides
1ULP accuracy for float, and 0.51ULP for half. The float
instruction does not natively support denormal
inputs, so the backend emits denormal scaling around
it; this scaling is omitted when the call is
marked with the :ref:`afn <fastmath_afn>` flag.


========================================= ==========================================================

.. TODO::
Expand Down
5 changes: 5 additions & 0 deletions llvm/docs/LangRef.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3431,6 +3431,8 @@ floating-point transformations.
to form arbitrary contractions. For example, ``(a*b) + (c*d) + e`` can not
be transformed into ``(a*b) + ((c*d) + e)`` to create two fma operations.

.. _fastmath_afn:

``afn``
Approximate functions - Allow substitution of approximate calculations for
functions (sin, log, sqrt, etc). See floating-point intrinsic definitions
Expand Down Expand Up @@ -14830,6 +14832,9 @@ trapping or setting ``errno``.
When specified with the fast-math-flag 'afn', the result may be approximated
using a less accurate calculation.


.. _int_log2:

'``llvm.log2.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
2 changes: 2 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ Changes to the AMDGPU Backend
* Added llvm.amdgcn.exp2.f32 intrinsic. This provides direct access to
v_exp_f32.

* llvm.log2.f32 is now lowered accurately. Use llvm.amdgcn.log.f32 to
access the old behavior.

Changes to the ARM Backend
--------------------------
Expand Down
86 changes: 82 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,11 +328,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,

// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
ISD::FMAXNUM},
setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
MVT::f32, Legal);

setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
Expand All @@ -345,8 +345,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,

if (Subtarget->has16BitInsts())
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
else
else {
setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
setOperationAction(ISD::FLOG2, MVT::f16, Custom);
}

// FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
// scalarization code. Can be removed when IS_FPCLASS expand isn't called by
Expand Down Expand Up @@ -1304,6 +1306,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
return LowerFROUNDEVEN(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG2:
return LowerFLOG2(Op, DAG);
case ISD::FLOG:
return LowerFLOG(Op, DAG, numbers::ln2);
case ISD::FLOG10:
Expand Down Expand Up @@ -1338,6 +1342,10 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
case ISD::FLOG2:
if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
Results.push_back(Lowered);
return;
default:
return;
}
Expand Down Expand Up @@ -2425,6 +2433,76 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(SDValue Src) {
  switch (Src.getOpcode()) {
  case ISD::FP_EXTEND:
    // An f32 produced by extending f16 is never denormal: the smallest f16
    // denormal lands well inside the f32 normal range.
    return Src.getOperand(0).getValueType() == MVT::f16;
  case ISD::FP16_TO_FP:
    // Same reasoning: the value originates from an f16.
    return true;
  default:
    return false;
  }
  // Note: a trailing llvm_unreachable here would be dead code -- every switch
  // path (including default) returns, matching the GlobalISel twin of this
  // helper which has no such statement.
}

SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // f16 reaches this custom lowering only on subtargets without a native
    // f16 log instruction; promote through f32.
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  // The scaling fixup is only needed when f32 denormal inputs must be
  // honored: skip it under afn / unsafe-math / approx-func, when the source
  // is known to never be an f32 denormal, or when the denormal mode flushes
  // inputs to zero anyway.
  bool NeedDenormHandling =
      !Flags.hasApproximateFuncs() && !DAG.getTarget().Options.UnsafeFPMath &&
      !DAG.getTarget().Options.ApproxFuncFPMath &&
      !valueIsKnownNeverF32Denorm(Src) &&
      DAG.getDenormalMode(VT).Input != DenormalMode::PreserveSign;

  if (!NeedDenormHandling)
    return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);

  const fltSemantics &Semantics = APFloat::IEEEsingle();
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path.
  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  // Multiply possibly-denormal inputs by 2^32 so the hardware log sees a
  // normal value; normal inputs are multiplied by 1.0 (a no-op).
  SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ScaleFactor =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);

  SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);

  SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);

  // Undo the scaling: log2(x * 2^32) == log2(x) + 32, so subtract 32 back off
  // on the scaled path (and 0 on the unscaled path).
  SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue ResultOffset =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
  return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
}

SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
double Log2BaseInverted) const {
EVT VT = Op.getValueType();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
double Log2BaseInverted) const;
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,7 @@ def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src),
def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src),
(AMDGPUfract_impl node:$src)]>;
def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
(AMDGPUlog_impl node:$src),
(flog2 node:$src)]>;
(AMDGPUlog_impl node:$src)]>;
def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
(flog2 node:$src)]>;

Expand Down
90 changes: 89 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1110,7 +1110,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);

// FIXME: fpow has a selection pattern that should move to custom lowering.
auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
auto &Exp2Ops = getActionDefinitionsBuilder(G_FEXP2);
if (ST.has16BitInsts())
Exp2Ops.legalFor({S32, S16});
else
Expand All @@ -1130,6 +1130,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, MinScalarFPTy, S32)
.lower();

auto &Log2Ops = getActionDefinitionsBuilder(G_FLOG2);
Log2Ops.customFor({S32});
if (ST.has16BitInsts())
Log2Ops.legalFor({S16});
else
Log2Ops.customFor({S16});
Log2Ops.scalarize(0)
.lower();

// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
Expand Down Expand Up @@ -1986,6 +1995,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeSignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
case TargetOpcode::G_FLOG2:
return legalizeFlog2(MI, B);
case TargetOpcode::G_FLOG:
return legalizeFlog(MI, B, numbers::ln2);
case TargetOpcode::G_FLOG10:
Expand Down Expand Up @@ -2978,6 +2989,83 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
return true;
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
Register Src) {
Register ExtSrc;
if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
return MRI.getType(ExtSrc) == LLT::scalar(16);
return false;
}

bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  const MachineFunction &MF = B.getMF();

  if (Ty == LLT::scalar(16)) {
    // s16 is custom-legalized only when there is no native f16 log; promote
    // through f32 instead.
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  const fltSemantics &Flt = APFloat::IEEEsingle();

  // The scaling fixup is only needed when f32 denormal inputs must be
  // honored: skip it under afn / unsafe-math / approx-func, when the source
  // is known to never be an f32 denormal, or when the denormal mode flushes
  // inputs to zero anyway.
  bool NeedDenormHandling =
      !MI.getFlag(MachineInstr::FmAfn) &&
      !MF.getTarget().Options.UnsafeFPMath &&
      !MF.getTarget().Options.ApproxFuncFPMath &&
      !valueIsKnownNeverF32Denorm(*B.getMRI(), Src) &&
      MF.getDenormalMode(Flt).Input != DenormalMode::PreserveSign;

  if (!NeedDenormHandling) {
    // Direct hardware log is accurate enough for normal inputs.
    B.buildIntrinsic(Intrinsic::amdgcn_log, ArrayRef<Register>{Dst}, false)
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path (hence "OrZero" in the name).
  auto SmallestNormal =
      B.buildFConstant(Ty, APFloat::getSmallestNormalized(Flt));
  auto IsDenormOrZero =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);

  // Multiply possibly-denormal inputs by 2^32 so the hardware log sees a
  // normal value; normal inputs are multiplied by 1.0 (a no-op).
  auto Scale32 = B.buildFConstant(Ty, 0x1.0p+32);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ScaleFactor = B.buildSelect(Ty, IsDenormOrZero, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(Ty, Src, ScaleFactor, Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
                  .addUse(ScaledInput.getReg(0))
                  .setMIFlags(Flags);

  // Undo the scaling: log2(x * 2^32) == log2(x) + 32, so subtract 32 back off
  // on the scaled path (and 0 on the unscaled path).
  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto ResultOffset = B.buildSelect(Ty, IsDenormOrZero, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFlog(
MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
Register Dst = MI.getOperand(0).getReg();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {

bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B,
double Log2BaseInverted) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5268,6 +5268,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
default:
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
}
}
Expand Down

0 comments on commit 89ccfa1

Please sign in to comment.