diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index e2eadad7bb4bf..36444a0cdb9e9 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -956,6 +956,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics. llvm.amdgcn.exp2 Provides direct access to v_exp_f32 and v_exp_f16 (on targets with half support). Performs exp2 function. + + :ref:`llvm.log2 <int_log2>` Implemented for float and half (and vectors of float or + half). Not implemented for double. Hardware provides + 1ULP accuracy for float, and 0.51ULP for half. Float + instruction does not natively support denormal + inputs. Backend will optimize out denormal scaling if + marked with the :ref:`afn <fastmath_afn>` flag. + + ========================================= ========================================================== .. TODO:: diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3e98b99905f16..014f24925f249 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3431,6 +3431,8 @@ floating-point transformations. to form arbitrary contractions. For example, ``(a*b) + (c*d) + e`` can not be transformed into ``(a*b) + ((c*d) + e)`` to create two fma operations. +.. _fastmath_afn: + ``afn`` Approximate functions - Allow substitution of approximate calculations for functions (sin, log, sqrt, etc). See floating-point intrinsic definitions @@ -14830,6 +14832,9 @@ trapping or setting ``errno``. When specified with the fast-math-flag 'afn', the result may be approximated using a less accurate calculation. + +.. _int_log2: + '``llvm.log2.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 53c2f3960a0b8..6efc991ff509e 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -143,6 +143,8 @@ Changes to the AMDGPU Backend * Added llvm.amdgcn.exp2.f32 intrinsic. This provides direct access to v_exp_f32. +* llvm.log2.f32 is now lowered accurately. 
Use llvm.amdgcn.log.f32 to + access the old behavior. Changes to the ARM Backend -------------------------- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a09604fa1872f..f3cceaa1d1f5a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -328,11 +328,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. - setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS, - ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, - ISD::FMAXNUM}, + setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FABS, ISD::FFLOOR, + ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom); @@ -345,8 +345,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - else + else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); + setOperationAction(ISD::FLOG2, MVT::f16, Custom); + } // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches // scalarization code. 
Can be removed when IS_FPCLASS expand isn't called by @@ -1304,6 +1306,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, return LowerFROUNDEVEN(Op, DAG); case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::FLOG2: + return LowerFLOG2(Op, DAG); case ISD::FLOG: return LowerFLOG(Op, DAG, numbers::ln2); case ISD::FLOG10: @@ -1338,6 +1342,10 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. return; + case ISD::FLOG2: + if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG)) + Results.push_back(Lowered); + return; default: return; } @@ -2425,6 +2433,76 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +/// Return true if it's known that \p Src can never be an f32 denormal value. +static bool valueIsKnownNeverF32Denorm(SDValue Src) { + switch (Src.getOpcode()) { + case ISD::FP_EXTEND: + return Src.getOperand(0).getValueType() == MVT::f16; + case ISD::FP16_TO_FP: + return true; + default: + return false; + } + + llvm_unreachable("covered opcode switch"); +} + +SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const { + // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. + + // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) + // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) + + SDLoc SL(Op); + EVT VT = Op.getValueType(); + SDValue Src = Op.getOperand(0); + SDNodeFlags Flags = Op->getFlags(); + + if (VT == MVT::f16) { + // Nothing in half is a denormal when promoted to f32. 
+ assert(!Subtarget->has16BitInsts()); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); + SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); + } + + bool NeedDenormHandling = + !Flags.hasApproximateFuncs() && !DAG.getTarget().Options.UnsafeFPMath && + !DAG.getTarget().Options.ApproxFuncFPMath && + !valueIsKnownNeverF32Denorm(Src) && + DAG.getDenormalMode(VT).Input != DenormalMode::PreserveSign; + + if (!NeedDenormHandling) + return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags); + + const fltSemantics &Semantics = APFloat::IEEEsingle(); + SDValue SmallestNormal = + DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); + + // Want to scale denormals up, but negatives and 0 work just as well on the + // scaled path. + SDValue IsLtSmallestNormal = DAG.getSetCC( + SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, + SmallestNormal, ISD::SETOLT); + + SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT); + SDValue One = DAG.getConstantFP(1.0, SL, VT); + SDValue ScaleFactor = + DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags); + + SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags); + + SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); + + SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT); + SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + SDValue ResultOffset = + DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero); + return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags); +} + SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, double Log2BaseInverted) const { EVT VT = Op.getValueType(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 7d3f23b28a50b..f25480906ee52 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -60,6 +60,7 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG, double Log2BaseInverted) const; SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index c5a23d2eb5e98..a79ff0b55624c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -392,8 +392,7 @@ def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src), def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src), (AMDGPUfract_impl node:$src)]>; def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src), - (AMDGPUlog_impl node:$src), - (flog2 node:$src)]>; + (AMDGPUlog_impl node:$src)]>; def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src), (flog2 node:$src)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index f19f2076d1f8b..e468e6285b2b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1110,7 +1110,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); // FIXME: fpow has a selection pattern that should move to custom lowering. 
- auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); + auto &Exp2Ops = getActionDefinitionsBuilder(G_FEXP2); if (ST.has16BitInsts()) Exp2Ops.legalFor({S32, S16}); else @@ -1130,6 +1130,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(0, MinScalarFPTy, S32) .lower(); + auto &Log2Ops = getActionDefinitionsBuilder(G_FLOG2); + Log2Ops.customFor({S32}); + if (ST.has16BitInsts()) + Log2Ops.legalFor({S16}); + else + Log2Ops.customFor({S16}); + Log2Ops.scalarize(0) + .lower(); + // The 64-bit versions produce 32-bit results, but only on the SALU. getActionDefinitionsBuilder(G_CTPOP) .legalFor({{S32, S32}, {S32, S64}}) @@ -1986,6 +1995,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeSignedDIV_REM(MI, MRI, B); case TargetOpcode::G_ATOMIC_CMPXCHG: return legalizeAtomicCmpXChg(MI, MRI, B); + case TargetOpcode::G_FLOG2: + return legalizeFlog2(MI, B); case TargetOpcode::G_FLOG: return legalizeFlog(MI, B, numbers::ln2); case TargetOpcode::G_FLOG10: @@ -2978,6 +2989,83 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( return true; } +/// Return true if it's known that \p Src can never be an f32 denormal value. +static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, + Register Src) { + Register ExtSrc; + if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc)))) + return MRI.getType(ExtSrc) == LLT::scalar(16); + return false; +} + +bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, + MachineIRBuilder &B) const { + // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. + + // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) + // log2 = amdgpu_log2 - (is_denormal ? 
32.0 : 0.0) + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT Ty = B.getMRI()->getType(Dst); + unsigned Flags = MI.getFlags(); + + const MachineFunction &MF = B.getMF(); + + if (Ty == LLT::scalar(16)) { + const LLT F32 = LLT::scalar(32); + // Nothing in half is a denormal when promoted to f32. + auto Ext = B.buildFPExt(F32, Src, Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); + B.buildFPTrunc(Dst, Log2, Flags); + MI.eraseFromParent(); + return true; + } + + assert(Ty == LLT::scalar(32)); + + const fltSemantics &Flt = APFloat::IEEEsingle(); + + bool NeedDenormHandling = + !MI.getFlag(MachineInstr::FmAfn) && + !MF.getTarget().Options.UnsafeFPMath && + !MF.getTarget().Options.ApproxFuncFPMath && + !valueIsKnownNeverF32Denorm(*B.getMRI(), Src) && + MF.getDenormalMode(Flt).Input != DenormalMode::PreserveSign; + + if (!NeedDenormHandling) { + B.buildIntrinsic(Intrinsic::amdgcn_log, ArrayRef{Dst}, false) + .addUse(Src) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; + } + + auto SmallestNormal = + B.buildFConstant(Ty, APFloat::getSmallestNormalized(Flt)); + auto IsDenormOrZero = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); + + auto Scale32 = B.buildFConstant(Ty, 0x1.0p+32); + auto One = B.buildFConstant(Ty, 1.0); + auto ScaleFactor = B.buildSelect(Ty, IsDenormOrZero, Scale32, One, Flags); + auto ScaledInput = B.buildFMul(Ty, Src, ScaleFactor, Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + .addUse(ScaledInput.getReg(0)) + .setMIFlags(Flags); + + auto ThirtyTwo = B.buildFConstant(Ty, 32.0); + auto Zero = B.buildFConstant(Ty, 0.0); + auto ResultOffset = B.buildSelect(Ty, IsDenormOrZero, ThirtyTwo, Zero, Flags); + B.buildFSub(Dst, Log2, ResultOffset, Flags); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFlog( MachineInstr &MI, MachineIRBuilder &B, double 
Log2BaseInverted) const { Register Dst = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index c93c66dd5ec5e..f7404849ca92b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -78,6 +78,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const; bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e3c63eddb4e7d..5a35cb41fbaf9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5268,6 +5268,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, return; } default: + AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); break; } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index 31a684fa592e1..6bdcf0f7973c6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -9,7 +9,15 @@ define float @v_pow_f32(float %x, float %y) { ; GFX6-LABEL: v_pow_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 
s[30:31] @@ -17,7 +25,15 @@ define float @v_pow_f32(float %x, float %y) { ; GFX8-LABEL: v_pow_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -25,7 +41,15 @@ define float @v_pow_f32(float %x, float %y) { ; GFX9-LABEL: v_pow_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -34,7 +58,12 @@ define float @v_pow_f32(float %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -43,10 +72,16 @@ define float @v_pow_f32(float %x, float %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) @@ -57,9 +92,22 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX6-LABEL: v_pow_v2f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x4f800000 +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_log_f32_e32 v1, v1 +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v6 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX6-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 @@ -68,9 +116,22 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX8-LABEL: v_pow_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 
v4, 0x4f800000 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX8-NEXT: v_log_f32_e32 v1, v1 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v6 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f32_e32 v1, v1 @@ -79,9 +140,22 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX9-LABEL: v_pow_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x4f800000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX9-NEXT: v_log_f32_e32 v1, v1 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 @@ -91,8 +165,18 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, v1 +; GFX10-NEXT: 
v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s4 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_log_f32_e32 v1, v1 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 @@ -103,12 +187,23 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s0 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: v_log_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3 +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3 ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 
x float> %y) @@ -647,7 +742,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX6-LABEL: v_pow_f32_fabs_lhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_log_f32_e64 v0, |v0| +; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -655,7 +758,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX8-LABEL: v_pow_f32_fabs_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_log_f32_e64 v0, |v0| +; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -663,7 +774,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX9-LABEL: v_pow_f32_fabs_lhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_log_f32_e64 v0, |v0| +; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 
+; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -672,7 +791,12 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_log_f32_e64 v0, |v0| +; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0| +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 +; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -681,8 +805,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_log_f32_e64 v0, |v0| +; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 @@ -696,7 +827,15 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX6-LABEL: v_pow_f32_fabs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX6-NEXT: 
v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -704,7 +843,15 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX8-LABEL: v_pow_f32_fabs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -712,7 +859,15 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX9-LABEL: v_pow_f32_fabs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -721,7 +876,12 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; 
GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -730,10 +890,16 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) @@ -745,7 +911,15 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX6-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_log_f32_e64 v0, |v0| +; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -753,7 +927,15 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX8: 
; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_log_f32_e64 v0, |v0| +; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -761,7 +943,15 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_log_f32_e64 v0, |v0| +; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -770,7 +960,12 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_log_f32_e64 v0, |v0| +; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0| +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 +; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -779,8 +974,15 @@ define float 
@v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_log_f32_e64 v0, |v0| +; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 @@ -794,36 +996,72 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX6-LABEL: v_pow_f32_sgpr_vgpr: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_log_f32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX6-NEXT: v_log_f32_e32 v1, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_vgpr: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_log_f32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX8-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX8-NEXT: v_log_f32_e32 v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; 
GFX8-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_log_f32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX9-NEXT: v_log_f32_e32 v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_log_f32_e32 v1, s0 +; GFX10-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1 +; GFX10-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX10-NEXT: v_log_f32_e32 v1, v1 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_log_f32_e32 v1, s0 +; GFX11-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1 +; GFX11-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 @@ -835,38 +1073,73 @@ define amdgpu_ps 
float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX6-LABEL: v_pow_f32_vgpr_sgpr: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_vgpr_sgpr: ; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_vgpr_sgpr: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_vgpr_sgpr: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_vgpr_sgpr: ; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) @@ -876,36 +1149,72 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX6-LABEL: v_pow_f32_sgpr_sgpr: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_log_f32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_sgpr: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_log_f32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX8-NEXT: 
v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_log_f32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_log_f32_e32 v0, s0 +; GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_log_f32_e32 v0, s0 +; GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2 +; GFX11-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v0, v0 ; 
GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 @@ -918,7 +1227,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX6-LABEL: v_pow_f32_fneg_lhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_log_f32_e64 v0, -v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX6-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -926,7 +1243,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX8-LABEL: v_pow_f32_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_log_f32_e64 v0, -v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX8-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -934,7 +1259,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX9-LABEL: v_pow_f32_fneg_lhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_log_f32_e64 v0, -v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 +; GFX9-NEXT: 
v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX9-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -943,7 +1276,12 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_log_f32_e64 v0, -v0 +; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, -v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 +; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -952,8 +1290,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_log_f32_e64 v0, -v0 +; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX11-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 @@ -967,7 +1312,15 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX6-LABEL: v_pow_f32_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -975,7 +1328,15 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX8-LABEL: v_pow_f32_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -983,7 +1344,15 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX9-LABEL: v_pow_f32_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -992,7 +1361,12 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1001,10 +1375,16 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, -v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog.mir index 9943af37a8d39..9bc02a199f409 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog.mir @@ -11,10 +11,20 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[COPY]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; CHECK-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_FLOG %0 $vgpr0 = COPY %1 @@ -30,10 +40,20 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[COPY]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan 
G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = nnan G_FLOG %0 $vgpr0 = COPY %1 @@ -50,12 +70,27 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_1]], [[C]] - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: 
[[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT1]], [[SELECT3]] + ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FSUB1]], [[C5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FLOG %0 @@ -73,14 +108,34 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_1]], [[C]] - ; CHECK-NEXT: [[FLOG2_2:%[0-9]+]]:_(s32) = G_FLOG2 [[UV2]] - ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_2]], [[C]] - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: 
[[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT1]], [[SELECT3]] + ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FSUB1]], [[C5]] + ; CHECK-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT4]] + ; CHECK-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL4]](s32) + ; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT5]] + ; CHECK-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = 
G_FMUL [[FSUB2]], [[C5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32), [[FMUL5]](s32) ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FLOG %0 @@ -99,9 +154,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[C]] ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -129,13 +184,13 @@ body: | ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FE62E4300000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C1]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[C1]] ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; CHECK-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT1]] - ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_1]], [[C1]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT1]](s32) + ; 
CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[C1]] ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog10.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog10.mir index 879ea62d80e34..7feba0889bbfa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog10.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog10.mir @@ -11,10 +11,20 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[COPY]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 
%1:_(s32) = G_FLOG10 %0 $vgpr0 = COPY %1 @@ -30,10 +40,20 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[COPY]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = nnan G_FLOG10 %0 $vgpr0 = COPY %1 @@ -50,12 +70,27 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: 
[[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_1]], [[C]] - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT1]], [[SELECT3]] + ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FSUB1]], [[C5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FLOG10 %0 
@@ -73,14 +108,34 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] - ; CHECK-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_1]], [[C]] - ; CHECK-NEXT: [[FLOG2_2:%[0-9]+]]:_(s32) = G_FLOG2 [[UV2]] - ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_2]], [[C]] - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FSUB]], [[C5]] + ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: 
[[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT1]], [[SELECT3]] + ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FSUB1]], [[C5]] + ; CHECK-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT4]] + ; CHECK-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL4]](s32) + ; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT5]] + ; CHECK-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FSUB2]], [[C5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32), [[FMUL5]](s32) ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FLOG10 %0 @@ -99,9 +154,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[C]] ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -129,13 +184,13 @@ body: | ; CHECK-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FD3441360000000 - ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_]], [[C1]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[C1]] ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; CHECK-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT1]] - ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FLOG2_1]], [[C1]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT1]](s32) + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[C1]] ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog2.mir index 42d2531f0301d..42135d4bca4a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-flog2.mir @@ -11,8 +11,18 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[COPY]] - ; CHECK-NEXT: $vgpr0 = COPY [[FLOG2_]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT 
float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: $vgpr0 = COPY [[FSUB]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_FLOG2 %0 $vgpr0 = COPY %1 @@ -29,9 +39,24 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; CHECK-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FLOG2_]](s32), [[FLOG2_1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = 
G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) + ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT1]], [[SELECT3]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSUB]](s32), [[FSUB1]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FLOG2 %0 @@ -49,10 +74,30 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; CHECK-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; CHECK-NEXT: [[FLOG2_2:%[0-9]+]]:_(s32) = G_FLOG2 [[UV2]] - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FLOG2_]](s32), [[FLOG2_1]](s32), [[FLOG2_2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], 
[[C4]] + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) + ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT1]], [[SELECT3]] + ; CHECK-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT4]] + ; CHECK-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; CHECK-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT5]] + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSUB]](s32), [[FSUB1]](s32), [[FSUB2]](s32) ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FLOG2 %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir index 1b8d5cb0b3e32..b592283dee139 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir @@ -15,18 +15,38 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[COPY]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[COPY1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] + 
; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) ; GFX9-LABEL: name: test_fpow_s32 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[COPY]] - ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -47,12 +67,27 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX6-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[UV2]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] - ; GFX6-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_1]](s32), [[UV3]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; 
GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV2]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] + ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV3]](s32) + ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_fpow_v2s32 @@ -62,12 +97,27 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[UV2]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] - ; GFX9-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_1]](s32), [[UV3]](s32) - ; GFX9-NEXT: 
[[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV2]](s32) + ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] + ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV3]](s32) + ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -89,15 +139,35 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(<3 x 
s32>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX6-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[UV3]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] - ; GFX6-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_1]](s32), [[UV4]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX6-NEXT: [[FLOG2_2:%[0-9]+]]:_(s32) = G_FLOG2 [[UV2]] - ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_2]](s32), [[UV5]](s32) - ; GFX6-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[INT2]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV3]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = 
G_FEXP2 [[INT1]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] + ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV4]](s32) + ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] + ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX6-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT4]] + ; GFX6-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT4]], [[SELECT5]] + ; GFX6-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB2]](s32), [[UV5]](s32) + ; GFX6-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[INT5]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fpow_v3s32 @@ -107,15 +177,35 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) - ; GFX9-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[UV]] - ; GFX9-NEXT: 
[[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[UV3]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] - ; GFX9-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[UV1]] - ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_1]](s32), [[UV4]](s32) - ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX9-NEXT: [[FLOG2_2:%[0-9]+]]:_(s32) = G_FLOG2 [[UV2]] - ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_2]](s32), [[UV5]](s32) - ; GFX9-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[INT2]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV3]](s32) + ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) + ; 
GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] + ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV4]](s32) + ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] + ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX9-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT4]] + ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT4]], [[SELECT5]] + ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB2]](s32), [[UV5]](s32) + ; GFX9-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[INT5]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 @@ -135,18 +225,38 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan nsz G_FLOG2 [[COPY]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[COPY1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan 
nsz G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan nsz G_FMUL [[COPY]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan nsz G_FSUB [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT1]] ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) ; GFX9-LABEL: name: test_fpow_s32_flags ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan nsz G_FLOG2 [[COPY]] - ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan nsz G_FMUL [[COPY]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = 
nnan nsz G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan nsz G_FSUB [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT1]] ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -169,9 +279,9 @@ body: | ; GFX6-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[FPEXT1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), [[FPEXT1]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -221,15 +331,15 @@ body: | ; GFX6-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[FPEXT1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), 
[[FPEXT1]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) ; GFX6-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; GFX6-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = G_FLOG2 [[FPEXT2]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_1]](s32), [[FPEXT3]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT2]](s32) + ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT2]](s32), [[FPEXT3]](s32) + ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_1]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) @@ -293,15 +403,15 @@ body: | ; GFX6-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan nsz G_FLOG2 [[FPEXT]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[FPEXT1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), [[FPEXT1]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT1]] ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) ; GFX6-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; 
GFX6-NEXT: [[FLOG2_1:%[0-9]+]]:_(s32) = nnan nsz G_FLOG2 [[FPEXT2]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_1]](s32), [[FPEXT3]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT1]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT2]](s32) + ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT2]](s32), [[FPEXT3]](s32) + ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT3]] ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_1]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir index 08b0cb4312af0..2bb9f17566978 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir @@ -18,9 +18,9 @@ body: | ; GFX6-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY1]](s32) - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[FPEXT]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[SITOFP]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), [[SITOFP]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT1]] ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY 
[[ANYEXT]](s32) @@ -59,9 +59,19 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY1]](s32) - ; GFX6-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[COPY]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[SITOFP]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan G_FSUB [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[SITOFP]](s32) + ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT1]] ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) ; GFX9-LABEL: name: test_fpowi_s32_s32_flags ; GFX9: liveins: $vgpr0, $vgpr1 @@ -69,9 +79,19 @@ body: | ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY1]](s32) - ; GFX9-NEXT: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[COPY]] - ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), 
[[FLOG2_]](s32), [[SITOFP]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3810000000000000 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 3.200000e+01 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan G_FSUB [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[SITOFP]](s32) + ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT1]] ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll index db05403197217..d5d6590e2c83b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -54,8 +54,16 @@ define float @v_powi_f32(float %l, i32 %r) { ; GFX78-LABEL: v_powi_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX78-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: 
v_cvt_f32_i32_e32 v1, v1 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -64,11 +72,17 @@ define float @v_powi_f32(float %l, i32 %r) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 %r) @@ -111,7 +125,15 @@ define float @v_powi_neg1_f32(float %l) { ; GFX78-LABEL: v_powi_neg1_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -120,10 +142,16 @@ define float @v_powi_neg1_f32(float %l) { ; GFX11: ; %bb.0: 
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -1) @@ -134,7 +162,15 @@ define float @v_powi_2_f32(float %l) { ; GFX78-LABEL: v_powi_2_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -143,10 +179,16 @@ define float @v_powi_2_f32(float %l) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; 
GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 2) @@ -157,7 +199,15 @@ define float @v_powi_neg2_f32(float %l) { ; GFX78-LABEL: v_powi_neg2_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -166,10 +216,16 @@ define float @v_powi_neg2_f32(float %l) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -2.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -2) @@ -180,7 +236,15 @@ define float 
@v_powi_4_f32(float %l) { ; GFX78-LABEL: v_powi_4_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -189,10 +253,16 @@ define float @v_powi_4_f32(float %l) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 4.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 4) @@ -203,7 +273,15 @@ define float @v_powi_8_f32(float %l) { ; GFX78-LABEL: v_powi_8_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: 
v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -212,10 +290,16 @@ define float @v_powi_8_f32(float %l) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41000000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 8) @@ -226,7 +310,15 @@ define float @v_powi_16_f32(float %l) { ; GFX78-LABEL: v_powi_16_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -235,10 +327,16 @@ define float @v_powi_16_f32(float %l) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; 
GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41800000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 16) @@ -249,7 +347,15 @@ define float @v_powi_128_f32(float %l) { ; GFX78-LABEL: v_powi_128_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -258,10 +364,16 @@ define float @v_powi_128_f32(float %l) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x43000000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 128) @@ -272,7 +384,15 @@ define float @v_powi_neg128_f32(float %l) { ; GFX78-LABEL: v_powi_neg128_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0xc3000000, v0 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] @@ -281,10 +401,16 @@ define float @v_powi_neg128_f32(float %l) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0xc3000000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -128) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index e03824d3d31e0..2ac3110628bd5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -12,49 +12,121 @@ ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { -; SI-LABEL: s_log_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_log_f32_e32 v0, s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: s_log_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_log_f32_e32 v0, s2 -; VI-NEXT: v_mul_f32_e32 v2, 0x3f317218, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; SI-SDAG-LABEL: s_log_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_log_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; 
SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_log_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_log_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: 
v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -64,9 +136,16 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v1, v0 :: v_dual_mov_b32 v1, 0 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm @@ -76,10 +155,16 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 
0x42000000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -87,28 +172,44 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; R600-LABEL: s_log_f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[2].Z, -; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.W, KC0[2].Z, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: ADD * T0.W, PS, -T0.W, +; R600-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 1060205080(6.931472e-01), 2(2.802597e-45) ; ; CM-LABEL: s_log_f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 15, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LOG_IEEE T0.X, KC0[2].Z, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[2].Z, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[2].Z, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[2].Z, -; CM-NEXT: MUL_IEEE * T0.X, PV.X, literal.x, +; 
CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, T0.W, 0.0, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[2].Z, PV.W, +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: ADD * T0.W, PV.X, -T0.Z, +; CM-NEXT: MUL_IEEE * T0.X, PV.W, literal.x, ; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -123,40 +224,81 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s3 -; SI-SDAG-NEXT: v_log_f32_e32 v2, s2 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s4, s0 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v2 +; 
SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log_v2f32: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s4, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s2 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s3 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v3 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s3 -; VI-SDAG-NEXT: v_log_f32_e32 v2, s2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: 
v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v2, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -164,37 +306,80 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-GISEL-LABEL: s_log_v2f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s4, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s2 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v4, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v3 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; 
VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s3 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, s2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v2 -; GFX900-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v2f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s2 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s3 +; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v2, v3 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -202,10 +387,21 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, s2 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s3, v2 :: v_dual_mul_f32 v3, s2, v3 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v3, v1 :: v_dual_mov_b32 v3, 0 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v1, 0x3f317218, v0 :: v_dual_mul_f32 v0, 0x3f317218, v2 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,11 +410,22 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s2 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3f317218, v0 :: v_dual_mul_f32 v1, 0x3f317218, v1 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; 
GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -226,37 +433,66 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; R600-LABEL: s_log_v2f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 21, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].X, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[2].W, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].W, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.W, KC0[3].X, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Z, KC0[2].W, PS, +; R600-NEXT: CNDE T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: CNDE T1.Z, T1.W, 0.0, literal.x, +; R600-NEXT: ADD T0.W, PS, -PV.W, +; R600-NEXT: LOG_IEEE * T0.X, PV.Z, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; R600-NEXT: ADD * T0.W, PS, -PV.Z, ; R600-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 1060205080(6.931472e-01), 2(2.802597e-45) ; ; CM-LABEL: s_log_v2f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 27, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LOG_IEEE T0.X, KC0[3].X, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].X, -; CM-NEXT: 
LOG_IEEE T0.Z (MASKED), KC0[3].X, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].X, -; CM-NEXT: MUL_IEEE * T0.Y, PV.X, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].X, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[2].W, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: CNDE T1.Z, T0.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].X, PV.Z, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: CNDE T1.Y, T1.W, 0.0, literal.x, +; CM-NEXT: ADD T0.Z, PV.X, -T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, KC0[2].W, T0.Y, +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T0.Y, T0.Z, literal.x, +; CM-NEXT: ADD * T0.W, PV.X, -T1.Y, ; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[2].W, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[2].W, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[2].W, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[2].W, -; CM-NEXT: MUL_IEEE * T0.X, PV.X, literal.x, +; CM-NEXT: MUL_IEEE * T0.X, PV.W, literal.x, ; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -269,16 +505,34 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s5 -; SI-SDAG-NEXT: v_log_f32_e32 v2, s4 -; SI-SDAG-NEXT: v_log_f32_e32 v3, s6 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v3 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v5, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm @@ -287,14 +541,35 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; SI-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_log_f32_e32 v2, s6 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v5 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 @@ -303,15 +578,33 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s6 -; VI-SDAG-NEXT: v_log_f32_e32 v3, s4 -; VI-SDAG-NEXT: v_log_f32_e32 v1, s5 -; VI-SDAG-NEXT: 
v_mul_f32_e32 v2, 0x3f317218, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -320,12 +613,33 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: 
v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v5 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -335,31 +649,70 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX900-SDAG-NEXT: v_log_f32_e32 v3, s4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: 
v_cndmask_b32_e32 v6, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v3 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v4 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v7, v0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v3 -; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX900-SDAG-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v3f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, s6 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, s5 
+; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v4 +; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v5 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_v3f32: @@ -368,13 +721,30 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s6 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, s4 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: 
v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, s6, v3 :: v_dual_mul_f32 v4, s5, v4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s7 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, s4, v5 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3f317218, v0 :: v_dual_mul_f32 v1, 0x3f317218, v1 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v3 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v4, v1 :: v_dual_mov_b32 v4, 0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v5, v2 :: v_dual_mul_f32 v2, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v1, 0x3f317218, v1 :: v_dual_mul_f32 v0, 0x3f317218, v3 ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm @@ -384,13 +754,31 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: 
v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3f317218, v0 :: v_dual_mul_f32 v1, 0x3f317218, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_sub_f32 v1, v1, v4 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v1, 0x3f317218, v1 :: v_dual_mul_f32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -398,53 +786,91 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; R600-LABEL: s_log_v3f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 +; R600-NEXT: ALU 33, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; 
R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Z, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Y, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].Z, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].Y, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.W, KC0[3].Z, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, KC0[3].Y, PS, +; R600-NEXT: CNDE T0.Z, T0.W, 0.0, literal.x, +; R600-NEXT: SETGT T0.W, literal.y, KC0[3].W, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: 1107296256(3.200000e+01), 8388608(1.175494e-38) +; R600-NEXT: CNDE T1.Y, T1.W, 0.0, literal.x, +; R600-NEXT: CNDE T1.Z, PV.W, 1.0, literal.y, +; R600-NEXT: ADD T1.W, PS, -PV.Z, +; R600-NEXT: LOG_IEEE * T0.X, PV.Y, +; R600-NEXT: 1107296256(3.200000e+01), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; R600-NEXT: MUL_IEEE T1.W, KC0[3].W, PV.Z, +; R600-NEXT: ADD * T2.W, PS, -PV.Y, ; R600-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) ; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; R600-NEXT: 1060205080(6.931472e-01), 2(2.802597e-45) -; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; R600-NEXT: LSHR T2.X, PV.W, literal.x, -; R600-NEXT: LOG_IEEE * T0.Z, KC0[3].W, +; R600-NEXT: CNDE T0.W, T0.W, 0.0, literal.y, +; R600-NEXT: LOG_IEEE * T0.Z, PV.W, +; R600-NEXT: 1060205080(6.931472e-01), 1107296256(3.200000e+01) +; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, +; R600-NEXT: ADD * T0.W, PS, -PV.W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.X, PV.W, literal.x, +; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; R600-NEXT: 1060205080(6.931472e-01), 8(1.121039e-44) +; R600-NEXT: LSHR * 
T3.X, PV.W, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE * T3.X, PS, literal.x, -; R600-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) ; ; CM-LABEL: s_log_v3f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 22, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: ALU 40, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR * T0.X, PV.W, literal.x, -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Y, KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.X (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Z, KC0[3].W, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].W, -; CM-NEXT: MUL_IEEE T1.X, PV.Z, literal.x, -; CM-NEXT: MUL_IEEE * T2.Y, T0.Y, literal.x, -; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Y, KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Y, -; CM-NEXT: MUL_IEEE * T2.X, PV.Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].W, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].Y, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.X, PV.W, 0.0, literal.x, +; CM-NEXT: CNDE T1.Y, PV.Z, 1.0, literal.y, +; CM-NEXT: CNDE T1.Z, T0.W, 0.0, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].W, PV.Y, +; CM-NEXT: 1107296256(3.200000e+01), 
1333788672(4.294967e+09) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y, T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: CNDE T1.X, T1.W, 1.0, literal.x, +; CM-NEXT: CNDE T2.Y, T0.Z, 0.0, literal.y, +; CM-NEXT: ADD T0.Z, PV.Y, -T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Z, T1.Y, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y, T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T2.X, T0.Z, literal.x, +; CM-NEXT: ADD T0.Y, PV.Y, -T2.Y, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Y, T1.X, +; CM-NEXT: 1060205080(6.931472e-01), 8(1.121039e-44) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W, T0.W, +; CM-NEXT: LSHR T1.X, T0.Z, literal.x, +; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.y, +; CM-NEXT: ADD * T0.W, PV.W, -T0.X, +; CM-NEXT: 2(2.802597e-45), 1060205080(6.931472e-01) +; CM-NEXT: MUL_IEEE * T0.X, PV.W, literal.x, ; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -460,17 +886,40 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4f800000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v4, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 
v6, 1.0, v4, vcc +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v4, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v1 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s7 -; SI-SDAG-NEXT: v_log_f32_e32 v1, s6 -; SI-SDAG-NEXT: v_log_f32_e32 v4, s5 -; SI-SDAG-NEXT: v_log_f32_e32 v5, s4 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v5 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; @@ -478,34 +927,83 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; 
SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; SI-GISEL-NEXT: v_log_f32_e32 v3, s7 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v5 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v5, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v6 +; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: 
v_mov_b32_e32 v2, 0x4f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s7 -; VI-SDAG-NEXT: v_log_f32_e32 v1, s6 -; VI-SDAG-NEXT: v_log_f32_e32 v4, s5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, s4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v7, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -514,13 +1012,39 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> 
%in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_log_f32_e32 v3, s7 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v5 +; VI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v5, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v6 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 @@ -532,33 +1056,82 @@ define 
amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s7 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, s6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v5, s5 -; GFX900-SDAG-NEXT: v_log_f32_e32 v6, s4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v6 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; 
GFX900-SDAG-NEXT: v_log_f32_e32 v4, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX900-SDAG-NEXT: global_store_dwordx4 v7, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v4f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, s6 -; GFX900-GISEL-NEXT: v_log_f32_e32 v3, s7 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v5 +; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v5, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 
1.0, v3, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5 +; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v6 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -568,12 +1141,34 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s7 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, s6 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, s5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, s4 -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mul_f32 v3, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s7, v4 :: v_dual_mul_f32 v5, s6, v5 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s9 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v4, v0 :: v_dual_sub_f32 v1, v5, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v4, v6, v2 :: v_dual_sub_f32 v5, v7, v3 +; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mul_f32 v3, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3f317218, v1 :: v_dual_mul_f32 v1, 0x3f317218, v4 ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v5 ; GFX1100-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1] @@ -585,15 +1180,36 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, s6 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, s7 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3f317218, v0 :: v_dual_mul_f32 v1, 0x3f317218, v1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 
s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3f317218, v0 :: v_dual_mul_f32 v1, 0x3f317218, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, 0x3f317218, v2 :: v_dual_mul_f32 v3, 0x3f317218, v3 ; 
GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -601,57 +1217,108 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; R600-LABEL: s_log_v4f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 38, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[4].X, -; R600-NEXT: MUL_IEEE T0.W, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].W, -; R600-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Z, +; R600-NEXT: SETGT T0.W, literal.x, KC0[4].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].W, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Z, KC0[4].X, PV.W, +; R600-NEXT: SETGT T2.W, literal.x, KC0[3].Z, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.y, +; R600-NEXT: 8388608(1.175494e-38), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T0.X, KC0[3].W, PS, +; R600-NEXT: SETGT T0.Y, literal.x, KC0[3].Y, +; R600-NEXT: CNDE T1.Z, T0.W, 0.0, literal.y, +; R600-NEXT: CNDE T0.W, PV.W, 1.0, literal.z, +; R600-NEXT: LOG_IEEE * T0.Z, PV.Z, +; R600-NEXT: 8388608(1.175494e-38), 1107296256(3.200000e+01) +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.X, KC0[3].Z, PV.W, +; R600-NEXT: ADD T1.Y, PS, -PV.Z, +; R600-NEXT: CNDE T0.Z, T1.W, 0.0, literal.x, +; R600-NEXT: CNDE T0.W, PV.Y, 1.0, literal.y, +; R600-NEXT: LOG_IEEE * T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T2.X, KC0[3].Y, PV.W, +; R600-NEXT: CNDE T2.Y, T2.W, 0.0, literal.x, +; R600-NEXT: ADD T0.Z, PS, -PV.Z, +; R600-NEXT: MUL_IEEE T0.W, PV.Y, 
literal.y, +; R600-NEXT: LOG_IEEE * T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 1060205080(6.931472e-01) +; R600-NEXT: CNDE T1.Y, T0.Y, 0.0, literal.x, +; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.y, +; R600-NEXT: ADD T1.W, PS, -PV.Y, +; R600-NEXT: LOG_IEEE * T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 1060205080(6.931472e-01) +; R600-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; R600-NEXT: ADD * T1.W, PS, -PV.Y, ; R600-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Y, -; R600-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 1060205080(6.931472e-01), 2(2.802597e-45) ; ; CM-LABEL: s_log_v4f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X +; CM-NEXT: ALU 50, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LOG_IEEE T0.X, KC0[4].X, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[4].X, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[4].X, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[4].X, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x, -; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].W, -; CM-NEXT: MUL_IEEE * T0.Z, PV.X, literal.x, +; CM-NEXT: SETGT T0.Z, literal.x, KC0[4].X, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].Y, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: CNDE T1.Z, PV.Z, 1.0, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].W, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.X, PV.W, 1.0, 
literal.x, +; CM-NEXT: SETGT T1.Y, literal.y, KC0[3].Z, +; CM-NEXT: CNDE T0.Z, T0.Z, 0.0, literal.z, +; CM-NEXT: MUL_IEEE * T2.W, KC0[4].X, PV.Z, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T1.X, T2.W, +; CM-NEXT: LOG_IEEE T1.Y (MASKED), T2.W, +; CM-NEXT: LOG_IEEE T1.Z (MASKED), T2.W, +; CM-NEXT: LOG_IEEE * T1.W (MASKED), T2.W, +; CM-NEXT: ADD T1.X, PV.X, -T0.Z, +; CM-NEXT: CNDE T2.Y, T1.Y, 1.0, literal.x, +; CM-NEXT: CNDE T0.Z, T1.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE * T1.W, KC0[3].W, T0.X, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: ADD T0.X, PV.X, -T0.Z, +; CM-NEXT: CNDE T1.Y, T1.Y, 0.0, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, KC0[3].Z, T2.Y, +; CM-NEXT: MUL_IEEE * T1.W, T1.X, literal.y, +; CM-NEXT: 1107296256(3.200000e+01), 1060205080(6.931472e-01) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.Z, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.Z, +; CM-NEXT: LOG_IEEE T0.Z, T0.Z, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.Z, +; CM-NEXT: CNDE T1.X, T0.W, 0.0, literal.x, +; CM-NEXT: ADD T1.Y, PV.Z, -T1.Y, +; CM-NEXT: MUL_IEEE T1.Z, T0.X, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Y, T0.Y, BS:VEC_021/SCL_122 +; CM-NEXT: 1107296256(3.200000e+01), 1060205080(6.931472e-01) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T1.Y, T1.Y, literal.x, +; CM-NEXT: ADD * T0.W, PV.X, -T1.X, ; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Z, -; CM-NEXT: MUL_IEEE * T0.Y, PV.X, literal.x, +; CM-NEXT: MUL_IEEE * 
T1.X, PV.W, literal.x, ; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Y, -; CM-NEXT: MUL_IEEE * T0.X, PV.X, literal.x, -; CM-NEXT: 1060205080(6.931472e-01), 0(0.000000e+00) -; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <4 x float> @llvm.log.v4f32(<4 x float> %in) store <4 x float> %result, ptr addrspace(1) %out @@ -659,21 +1326,67 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { } define float @v_log_f32(float %in) { -; GFX689-LABEL: v_log_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32: +; GFX689-GISEL: ; %bb.0: 
+; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: 
s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32: ; R600: ; %bb.0: @@ -689,21 +1402,67 @@ define float @v_log_f32(float %in) { } define float @v_log_fabs_f32(float %in) { -; GFX689-LABEL: v_log_fabs_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, |v0| -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_fabs_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_fabs_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, |v0| -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_fabs_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; 
GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_fabs_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_fabs_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: 
v_log_fabs_f32: ; R600: ; %bb.0: @@ -720,21 +1479,67 @@ define float @v_log_fabs_f32(float %in) { } define float @v_log_fneg_fabs_f32(float %in) { -; GFX689-LABEL: v_log_fneg_fabs_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, -|v0| -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_fneg_fabs_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_fneg_fabs_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, -|v0| -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_fneg_fabs_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: 
v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_fneg_fabs_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_fneg_fabs_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_fneg_fabs_f32: ; R600: ; %bb.0: @@ -752,30 +1557,76 @@ define float @v_log_fneg_fabs_f32(float %in) { } define float @v_log_fneg_f32(float %in) { -; GFX689-LABEL: v_log_fneg_f32: -; GFX689: ; %bb.0: -; 
GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, -v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: v_log_fneg_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, -v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_fneg_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; R600-LABEL: v_log_fneg_f32: -; R600: ; %bb.0: -; R600-NEXT: CF_END -; R600-NEXT: PAD +; GFX689-GISEL-LABEL: v_log_fneg_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; CM-LABEL: v_log_fneg_f32: -; CM: ; %bb.0: -; CM-NEXT: CF_END +; 
GFX1100-SDAG-LABEL: v_log_fneg_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_fneg_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_log_fneg_f32: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD +; +; CM-LABEL: v_log_fneg_f32: +; CM: ; %bb.0: +; CM-NEXT: CF_END ; CM-NEXT: PAD %fneg = fneg float %in %result = call float @llvm.log.f32(float %fneg) @@ -783,21 +1634,52 @@ define float @v_log_fneg_f32(float %in) { } define float @v_log_f32_fast(float %in) { -; 
GFX689-LABEL: v_log_f32_fast: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_fast: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_fast: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_fast: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_fast: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; 
GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_fast: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_fast: ; R600: ; %bb.0: @@ -873,21 +1755,67 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { } define float @v_log_f32_ninf(float %in) { -; GFX689-LABEL: v_log_f32_ninf: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_ninf: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_ninf: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: 
v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_ninf: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_ninf: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_ninf: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, 
v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_ninf: ; R600: ; %bb.0: @@ -903,21 +1831,52 @@ define float @v_log_f32_ninf(float %in) { } define float @v_log_f32_afn(float %in) { -; GFX689-LABEL: v_log_f32_afn: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_afn: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_afn: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_afn: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX1100-SDAG-LABEL: v_log_f32_afn: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_afn: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_afn: ; R600: ; %bb.0: @@ -963,21 +1922,52 @@ define float @v_log_f32_afn_daz(float %in) #0 { } define float @v_log_f32_afn_dynamic(float %in) #1 { -; GFX689-LABEL: v_log_f32_afn_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_afn_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; 
GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_afn_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_afn_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_afn_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_afn_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; 
GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_afn_dynamic: ; R600: ; %bb.0: @@ -993,21 +1983,52 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { } define float @v_fabs_log_f32_afn(float %in) { -; GFX689-LABEL: v_fabs_log_f32_afn: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, |v0| -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_fabs_log_f32_afn: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_fabs_log_f32_afn: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, |v0| -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_fabs_log_f32_afn: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e64 v0, |v0| +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_fabs_log_f32_afn: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_fabs_log_f32_afn: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e64 v0, |v0| +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fabs_log_f32_afn: ; R600: ; %bb.0: @@ -1054,21 +2075,67 @@ define float @v_log_f32_daz(float %in) #0 { } define float @v_log_f32_nnan(float %in) { -; GFX689-LABEL: v_log_f32_nnan: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_nnan: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 
s[30:31] ; -; GFX1100-LABEL: v_log_f32_nnan: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_nnan: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_nnan: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_nnan: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_nnan: ; R600: ; %bb.0: @@ -1114,21 +2181,67 @@ define float @v_log_f32_nnan_daz(float %in) #0 { } define float @v_log_f32_nnan_dynamic(float %in) #1 { -; GFX689-LABEL: v_log_f32_nnan_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_nnan_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_nnan_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 
0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_nnan_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_nnan_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_nnan_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 
v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_nnan_dynamic: ; R600: ; %bb.0: @@ -1174,21 +2287,67 @@ define float @v_log_f32_ninf_daz(float %in) #0 { } define float @v_log_f32_ninf_dynamic(float %in) #1 { -; GFX689-LABEL: v_log_f32_ninf_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_ninf_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_ninf_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_ninf_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: 
v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_ninf_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_ninf_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_ninf_dynamic: ; R600: ; %bb.0: @@ -1204,21 +2363,67 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { } define float @v_log_f32_nnan_ninf(float %in) { -; GFX689-LABEL: v_log_f32_nnan_ninf: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_nnan_ninf: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_nnan_ninf: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_nnan_ninf: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 
v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_nnan_ninf: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_nnan_ninf: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_nnan_ninf: ; R600: ; %bb.0: @@ -1264,21 +2469,67 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 { } define float 
@v_log_f32_nnan_ninf_dynamic(float %in) #1 { -; GFX689-LABEL: v_log_f32_nnan_ninf_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_nnan_ninf_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_nnan_ninf_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_nnan_ninf_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX1100-SDAG-LABEL: v_log_f32_nnan_ninf_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_nnan_ninf_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_nnan_ninf_dynamic: ; R600: ; %bb.0: @@ -1324,21 +2575,67 @@ define float @v_log_f32_fast_daz(float %in) #0 { } define float @v_log_f32_dynamic_mode(float %in) #1 { -; GFX689-LABEL: v_log_f32_dynamic_mode: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: 
v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_dynamic_mode: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_dynamic_mode: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_dynamic_mode: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_dynamic_mode: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; 
GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_dynamic_mode: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_dynamic_mode: ; R600: ; %bb.0: @@ -1354,21 +2651,54 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { } define float @v_log_f32_undef() { -; GFX689-LABEL: v_log_f32_undef: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, s4 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log_f32_undef: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, s4 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 
0x3f317218, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_undef: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log_f32_undef: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e64 v2, s4, 1.0 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log_f32_undef: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log_f32_undef: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v1, s0, 1.0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1100-GISEL-NEXT: 
v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_undef: ; R600: ; %bb.0: @@ -1388,14 +2718,24 @@ define float @v_log_f32_0() { ; GFX689-SDAG: ; %bb.0: ; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX689-SDAG-NEXT: v_log_f32_e32 v0, 0 +; GFX689-SDAG-NEXT: v_add_f32_e32 v0, 0xc2000000, v0 ; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log_f32_0: ; GFX689-GISEL: ; %bb.0: ; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x3f317218 -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0xff800000, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e64 v2, 0, 1.0 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-LABEL: v_log_f32_0: @@ -1404,6 +2744,8 @@ define float @v_log_f32_0() { ; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, 0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, 0xc2000000, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX1100-SDAG-NEXT: s_setpc_b64 
s[30:31] ; @@ -1411,9 +2753,17 @@ define float @v_log_f32_0() { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0x3f317218 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0xff800000, v0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, 0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v1, 0, 1.0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, 0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_0: @@ -1470,8 +2820,16 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1538,7 +2896,15 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX689-SDAG-LABEL: v_log_f32_from_fpext_bf16: ; 
GFX689-SDAG: ; %bb.0: ; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1554,8 +2920,15 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2923,3 +4296,5 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; SI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index a813ae1d947f4..16f72de763fee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -12,49 +12,121 @@ ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { -; SI-LABEL: s_log10_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_log_f32_e32 v0, s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: s_log10_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_log_f32_e32 v0, s2 -; VI-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; SI-SDAG-LABEL: s_log10_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; 
SI-GISEL-LABEL: s_log10_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_log10_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_log10_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc 
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, 
v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -64,9 +136,16 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v1, v0 :: v_dual_mov_b32 v1, 0 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm @@ -76,10 +155,16 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; 
GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -87,28 +172,44 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; R600-LABEL: s_log10_f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[2].Z, -; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.W, KC0[2].Z, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: ADD * T0.W, PS, -T0.W, +; R600-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 1050288283(3.010300e-01), 2(2.802597e-45) ; ; CM-LABEL: s_log10_f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 15, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LOG_IEEE 
T0.X, KC0[2].Z, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[2].Z, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[2].Z, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[2].Z, -; CM-NEXT: MUL_IEEE * T0.X, PV.X, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, T0.W, 0.0, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[2].Z, PV.W, +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: ADD * T0.W, PV.X, -T0.Z, +; CM-NEXT: MUL_IEEE * T0.X, PV.W, literal.x, ; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -123,40 +224,81 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s3 -; SI-SDAG-NEXT: v_log_f32_e32 v2, s2 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s4, s0 +; SI-SDAG-NEXT: 
v_sub_f32_e32 v1, v4, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log10_v2f32: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s4, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s2 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s3 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v3 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s3 -; 
VI-SDAG-NEXT: v_log_f32_e32 v2, s2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v2, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -164,37 +306,80 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-GISEL-LABEL: s_log10_v2f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s4, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s2 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v4, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v3 +; VI-GISEL-NEXT: 
v_sub_f32_e32 v1, v4, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s3 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, s2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v2 -; GFX900-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v2f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 
0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s2 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s3 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v2, v3 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -202,10 +387,21 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, s2 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s3, v2 :: v_dual_mul_f32 v3, s2, v3 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v3, v1 :: v_dual_mov_b32 v3, 0 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v1, 0x3e9a209b, v0 :: v_dual_mul_f32 v0, 0x3e9a209b, v2 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,11 +410,22 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s2 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; 
GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3e9a209b, v0 :: v_dual_mul_f32 v1, 0x3e9a209b, v1 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -226,37 +433,66 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; R600-LABEL: s_log10_v2f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 21, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].X, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[2].W, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].W, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.W, KC0[3].X, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Z, KC0[2].W, PS, +; R600-NEXT: CNDE T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: CNDE T1.Z, T1.W, 0.0, literal.x, +; R600-NEXT: ADD T0.W, PS, -PV.W, +; R600-NEXT: LOG_IEEE * T0.X, PV.Z, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; R600-NEXT: ADD * T0.W, PS, -PV.Z, ; R600-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 1050288283(3.010300e-01), 2(2.802597e-45) ; ; CM-LABEL: s_log10_v2f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 27, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LOG_IEEE T0.X, KC0[3].X, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].X, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].X, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].X, -; CM-NEXT: MUL_IEEE * T0.Y, PV.X, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].X, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[2].W, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: CNDE T1.Z, T0.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].X, PV.Z, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: CNDE T1.Y, T1.W, 0.0, literal.x, +; CM-NEXT: ADD T0.Z, PV.X, -T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, KC0[2].W, T0.Y, +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T0.Y, T0.Z, literal.x, +; CM-NEXT: ADD * T0.W, PV.X, -T1.Y, ; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[2].W, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[2].W, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[2].W, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[2].W, -; CM-NEXT: MUL_IEEE * T0.X, PV.X, literal.x, +; CM-NEXT: MUL_IEEE * T0.X, PV.W, literal.x, ; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -269,16 +505,34 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s5 -; SI-SDAG-NEXT: v_log_f32_e32 v2, s4 -; SI-SDAG-NEXT: v_log_f32_e32 v3, s6 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v3 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v5, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm @@ -287,14 +541,35 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, 
-1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_log_f32_e32 v2, s6 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v5 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 @@ -303,15 +578,33 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s6 -; VI-SDAG-NEXT: v_log_f32_e32 v3, s4 -; VI-SDAG-NEXT: v_log_f32_e32 v1, s5 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -320,12 +613,33 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; 
VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v5 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -335,31 +649,70 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX900-SDAG-NEXT: v_log_f32_e32 v3, s4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; 
GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v3 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v4 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v7, v0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v3 -; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX900-SDAG-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v3f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, s6 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: 
v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v4 +; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v5 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_v3f32: @@ -368,13 +721,30 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s6 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, s4 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 
v4, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, s6, v3 :: v_dual_mul_f32 v4, s5, v4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s7 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, s4, v5 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3e9a209b, v0 :: v_dual_mul_f32 v1, 0x3e9a209b, v1 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v3 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v4, v1 :: v_dual_mov_b32 v4, 0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v5, v2 :: v_dual_mul_f32 v2, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v1, 0x3e9a209b, v1 :: v_dual_mul_f32 v0, 0x3e9a209b, v3 ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm @@ -384,13 +754,31 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: 
v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3e9a209b, v0 :: v_dual_mul_f32 v1, 0x3e9a209b, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_sub_f32 v1, v1, v4 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v1, 0x3e9a209b, v1 :: v_dual_mul_f32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -398,53 +786,91 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; R600-LABEL: s_log10_v3f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T3.X, T2.X, 0 +; R600-NEXT: ALU 33, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Z, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Y, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].Z, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].Y, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.W, KC0[3].Z, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, KC0[3].Y, PS, +; R600-NEXT: CNDE T0.Z, T0.W, 0.0, literal.x, +; R600-NEXT: SETGT T0.W, literal.y, KC0[3].W, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: 1107296256(3.200000e+01), 8388608(1.175494e-38) +; R600-NEXT: CNDE T1.Y, T1.W, 0.0, literal.x, +; R600-NEXT: CNDE T1.Z, PV.W, 1.0, literal.y, +; R600-NEXT: ADD T1.W, PS, -PV.Z, +; R600-NEXT: LOG_IEEE * T0.X, PV.Y, +; R600-NEXT: 1107296256(3.200000e+01), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; R600-NEXT: MUL_IEEE T1.W, KC0[3].W, PV.Z, +; R600-NEXT: ADD * T2.W, PS, -PV.Y, ; R600-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) ; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; R600-NEXT: 1050288283(3.010300e-01), 2(2.802597e-45) -; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; R600-NEXT: LSHR T2.X, PV.W, literal.x, -; R600-NEXT: LOG_IEEE * T0.Z, KC0[3].W, +; R600-NEXT: CNDE T0.W, T0.W, 0.0, literal.y, +; R600-NEXT: LOG_IEEE * T0.Z, PV.W, +; R600-NEXT: 1050288283(3.010300e-01), 1107296256(3.200000e+01) +; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, +; R600-NEXT: ADD * T0.W, PS, -PV.W, +; R600-NEXT: 
2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.X, PV.W, literal.x, +; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; R600-NEXT: 1050288283(3.010300e-01), 8(1.121039e-44) +; R600-NEXT: LSHR * T3.X, PV.W, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE * T3.X, PS, literal.x, -; R600-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) ; ; CM-LABEL: s_log10_v3f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 22, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: ALU 40, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR * T0.X, PV.W, literal.x, -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Y, KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.X (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Z, KC0[3].W, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].W, -; CM-NEXT: MUL_IEEE T1.X, PV.Z, literal.x, -; CM-NEXT: MUL_IEEE * T2.Y, T0.Y, literal.x, -; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Y, KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Y, -; CM-NEXT: MUL_IEEE * T2.X, PV.Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].W, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].Y, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.X, 
PV.W, 0.0, literal.x, +; CM-NEXT: CNDE T1.Y, PV.Z, 1.0, literal.y, +; CM-NEXT: CNDE T1.Z, T0.W, 0.0, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].W, PV.Y, +; CM-NEXT: 1107296256(3.200000e+01), 1333788672(4.294967e+09) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y, T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: CNDE T1.X, T1.W, 1.0, literal.x, +; CM-NEXT: CNDE T2.Y, T0.Z, 0.0, literal.y, +; CM-NEXT: ADD T0.Z, PV.Y, -T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Z, T1.Y, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y, T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T2.X, T0.Z, literal.x, +; CM-NEXT: ADD T0.Y, PV.Y, -T2.Y, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Y, T1.X, +; CM-NEXT: 1050288283(3.010300e-01), 8(1.121039e-44) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W, T0.W, +; CM-NEXT: LSHR T1.X, T0.Z, literal.x, +; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.y, +; CM-NEXT: ADD * T0.W, PV.W, -T0.X, +; CM-NEXT: 2(2.802597e-45), 1050288283(3.010300e-01) +; CM-NEXT: MUL_IEEE * T0.X, PV.W, literal.x, ; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -460,17 +886,40 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4f800000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: 
v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v4, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v4, vcc +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v4, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v1 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s7 -; SI-SDAG-NEXT: v_log_f32_e32 v1, s6 -; SI-SDAG-NEXT: v_log_f32_e32 v4, s5 -; SI-SDAG-NEXT: v_log_f32_e32 v5, s4 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v5 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; @@ -478,34 +927,83 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; 
SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; SI-GISEL-NEXT: v_log_f32_e32 v3, s7 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v5 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v5, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v6 +; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; 
VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s7 -; VI-SDAG-NEXT: v_log_f32_e32 v1, s6 -; VI-SDAG-NEXT: v_log_f32_e32 v4, s5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, s4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v7, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -514,13 +1012,39 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s2, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_log_f32_e32 v3, s7 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v5 +; VI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v5, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v6 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; 
VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 @@ -532,33 +1056,82 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s7 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, s6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v5, s5 -; GFX900-SDAG-NEXT: v_log_f32_e32 v6, s4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v6 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, 
v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX900-SDAG-NEXT: global_store_dwordx4 v7, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v4f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, s6 -; GFX900-GISEL-NEXT: v_log_f32_e32 v3, s7 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v5 +; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v5, v1 +; 
GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5 +; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v6 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -568,12 +1141,34 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s7 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, s6 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, s5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, s4 -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mul_f32 v3, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 
0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s7, v4 :: v_dual_mul_f32 v5, s6, v5 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s9 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v4, v0 :: v_dual_sub_f32 v1, v5, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v4, v6, v2 :: v_dual_sub_f32 v5, v7, v3 +; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mul_f32 v3, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3e9a209b, v1 :: v_dual_mul_f32 v1, 0x3e9a209b, v4 ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v5 ; GFX1100-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1] @@ -585,15 +1180,36 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, s6 -; 
GFX1100-GISEL-NEXT: v_log_f32_e32 v3, s7 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3e9a209b, v0 :: v_dual_mul_f32 v1, 0x3e9a209b, v1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 0x3e9a209b, v0 :: v_dual_mul_f32 v1, 0x3e9a209b, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, 0x3e9a209b, v2 :: v_dual_mul_f32 v3, 0x3e9a209b, v3 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -601,57 +1217,108 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; R600-LABEL: s_log10_v4f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 38, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.X, KC0[4].X, -; R600-NEXT: MUL_IEEE T0.W, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].W, -; R600-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Z, +; R600-NEXT: SETGT T0.W, literal.x, KC0[4].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].W, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Z, KC0[4].X, PV.W, +; R600-NEXT: SETGT T2.W, literal.x, KC0[3].Z, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.y, +; R600-NEXT: 8388608(1.175494e-38), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T0.X, KC0[3].W, PS, +; R600-NEXT: SETGT T0.Y, literal.x, KC0[3].Y, +; R600-NEXT: CNDE T1.Z, T0.W, 0.0, literal.y, +; R600-NEXT: CNDE T0.W, PV.W, 1.0, literal.z, +; R600-NEXT: LOG_IEEE * T0.Z, PV.Z, +; R600-NEXT: 8388608(1.175494e-38), 1107296256(3.200000e+01) +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.X, KC0[3].Z, PV.W, +; R600-NEXT: ADD T1.Y, PS, -PV.Z, +; R600-NEXT: CNDE T0.Z, T1.W, 0.0, literal.x, +; R600-NEXT: CNDE T0.W, PV.Y, 1.0, literal.y, +; R600-NEXT: LOG_IEEE * 
T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T2.X, KC0[3].Y, PV.W, +; R600-NEXT: CNDE T2.Y, T2.W, 0.0, literal.x, +; R600-NEXT: ADD T0.Z, PS, -PV.Z, +; R600-NEXT: MUL_IEEE T0.W, PV.Y, literal.y, +; R600-NEXT: LOG_IEEE * T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 1050288283(3.010300e-01) +; R600-NEXT: CNDE T1.Y, T0.Y, 0.0, literal.x, +; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.y, +; R600-NEXT: ADD T1.W, PS, -PV.Y, +; R600-NEXT: LOG_IEEE * T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 1050288283(3.010300e-01) +; R600-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; R600-NEXT: ADD * T1.W, PS, -PV.Y, ; R600-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Y, -; R600-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; R600-NEXT: 1050288283(3.010300e-01), 2(2.802597e-45) ; ; CM-LABEL: s_log10_v4f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X +; CM-NEXT: ALU 50, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LOG_IEEE T0.X, KC0[4].X, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[4].X, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[4].X, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[4].X, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x, -; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].W, -; CM-NEXT: MUL_IEEE * T0.Z, PV.X, literal.x, +; CM-NEXT: SETGT T0.Z, literal.x, KC0[4].X, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].Y, +; CM-NEXT: 8388608(1.175494e-38), 
0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: CNDE T1.Z, PV.Z, 1.0, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].W, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.X, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT T1.Y, literal.y, KC0[3].Z, +; CM-NEXT: CNDE T0.Z, T0.Z, 0.0, literal.z, +; CM-NEXT: MUL_IEEE * T2.W, KC0[4].X, PV.Z, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T1.X, T2.W, +; CM-NEXT: LOG_IEEE T1.Y (MASKED), T2.W, +; CM-NEXT: LOG_IEEE T1.Z (MASKED), T2.W, +; CM-NEXT: LOG_IEEE * T1.W (MASKED), T2.W, +; CM-NEXT: ADD T1.X, PV.X, -T0.Z, +; CM-NEXT: CNDE T2.Y, T1.Y, 1.0, literal.x, +; CM-NEXT: CNDE T0.Z, T1.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE * T1.W, KC0[3].W, T0.X, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: ADD T0.X, PV.X, -T0.Z, +; CM-NEXT: CNDE T1.Y, T1.Y, 0.0, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, KC0[3].Z, T2.Y, +; CM-NEXT: MUL_IEEE * T1.W, T1.X, literal.y, +; CM-NEXT: 1107296256(3.200000e+01), 1050288283(3.010300e-01) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.Z, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.Z, +; CM-NEXT: LOG_IEEE T0.Z, T0.Z, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.Z, +; CM-NEXT: CNDE T1.X, T0.W, 0.0, literal.x, +; CM-NEXT: ADD T1.Y, PV.Z, -T1.Y, +; CM-NEXT: MUL_IEEE T1.Z, T0.X, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Y, T0.Y, BS:VEC_021/SCL_122 +; CM-NEXT: 1107296256(3.200000e+01), 1050288283(3.010300e-01) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T1.Y, T1.Y, literal.x, +; CM-NEXT: ADD * T0.W, PV.X, -T1.X, ; CM-NEXT: 1050288283(3.010300e-01), 
0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Z, -; CM-NEXT: MUL_IEEE * T0.Y, PV.X, literal.x, +; CM-NEXT: MUL_IEEE * T1.X, PV.W, literal.x, ; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T0.X, KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Y (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE T0.Z (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE * T0.W (MASKED), KC0[3].Y, -; CM-NEXT: MUL_IEEE * T0.X, PV.X, literal.x, -; CM-NEXT: 1050288283(3.010300e-01), 0(0.000000e+00) -; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <4 x float> @llvm.log10.v4f32(<4 x float> %in) store <4 x float> %result, ptr addrspace(1) %out @@ -659,21 +1326,67 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) } define float @v_log10_f32(float %in) { -; GFX689-LABEL: v_log10_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo 
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32: ; R600: ; %bb.0: @@ -689,21 +1402,67 @@ define float @v_log10_f32(float %in) { } define float @v_log10_fabs_f32(float %in) { -; GFX689-LABEL: v_log10_fabs_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, |v0| -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_fabs_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_fabs_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, |v0| -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_fabs_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_fabs_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_fabs_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) 
| instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_fabs_f32: ; R600: ; %bb.0: @@ -720,21 +1479,67 @@ define float @v_log10_fabs_f32(float %in) { } define float @v_log10_fneg_fabs_f32(float %in) { -; GFX689-LABEL: v_log10_fneg_fabs_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, -|v0| -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_fneg_fabs_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_fneg_fabs_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, -|v0| -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_fneg_fabs_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; GFX689-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_fneg_fabs_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_fneg_fabs_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; 
GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_fneg_fabs_f32: ; R600: ; %bb.0: @@ -752,30 +1557,76 @@ define float @v_log10_fneg_fabs_f32(float %in) { } define float @v_log10_fneg_f32(float %in) { -; GFX689-LABEL: v_log10_fneg_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, -v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: v_log10_fneg_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, -v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_fneg_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; R600-LABEL: v_log10_fneg_f32: -; R600: ; %bb.0: -; R600-NEXT: CF_END -; R600-NEXT: PAD +; GFX689-GISEL-LABEL: v_log10_fneg_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX689-GISEL-NEXT: 
v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; CM-LABEL: v_log10_fneg_f32: -; CM: ; %bb.0: -; CM-NEXT: CF_END +; GFX1100-SDAG-LABEL: v_log10_fneg_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_fneg_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; 
R600-LABEL: v_log10_fneg_f32: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD +; +; CM-LABEL: v_log10_fneg_f32: +; CM: ; %bb.0: +; CM-NEXT: CF_END ; CM-NEXT: PAD %fneg = fneg float %in %result = call float @llvm.log10.f32(float %fneg) @@ -783,21 +1634,52 @@ define float @v_log10_fneg_f32(float %in) { } define float @v_log10_f32_fast(float %in) { -; GFX689-LABEL: v_log10_f32_fast: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_fast: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_fast: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_fast: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_fast: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_fast: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_fast: ; R600: ; %bb.0: @@ -873,21 +1755,67 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" } define float @v_log10_f32_ninf(float %in) { -; GFX689-LABEL: v_log10_f32_ninf: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_ninf: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: 
v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_ninf: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_ninf: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_ninf: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_ninf: +; GFX1100-GISEL: ; %bb.0: +; 
GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_ninf: ; R600: ; %bb.0: @@ -903,21 +1831,52 @@ define float @v_log10_f32_ninf(float %in) { } define float @v_log10_f32_afn(float %in) { -; GFX689-LABEL: v_log10_f32_afn: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_afn: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_afn: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; 
GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_afn: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_afn: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_afn: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_afn: ; R600: ; %bb.0: @@ -963,21 +1922,52 @@ define float @v_log10_f32_afn_daz(float %in) #0 { } define float @v_log10_f32_afn_dynamic(float %in) #1 { -; GFX689-LABEL: v_log10_f32_afn_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; 
GFX689-SDAG-LABEL: v_log10_f32_afn_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_afn_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_afn_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_afn_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_afn_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_afn_dynamic: ; R600: ; %bb.0: @@ -993,21 +1983,52 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { } define float @v_fabs_log10_f32_afn(float %in) { -; GFX689-LABEL: v_fabs_log10_f32_afn: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, |v0| -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_fabs_log10_f32_afn: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_fabs_log10_f32_afn: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, |v0| -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_fabs_log10_f32_afn: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_log_f32_e64 v0, |v0| +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_fabs_log10_f32_afn: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_fabs_log10_f32_afn: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_log_f32_e64 v0, |v0| +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fabs_log10_f32_afn: ; R600: ; %bb.0: @@ -1054,21 +2075,67 @@ define float @v_log10_f32_daz(float %in) #0 { } define float @v_log10_f32_nnan(float %in) { -; GFX689-LABEL: v_log10_f32_nnan: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_nnan: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: 
v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_nnan: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_nnan: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_nnan: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; 
GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_nnan: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_nnan: ; R600: ; %bb.0: @@ -1114,21 +2181,67 @@ define float @v_log10_f32_nnan_daz(float %in) #0 { } define float @v_log10_f32_nnan_dynamic(float %in) #1 { -; GFX689-LABEL: v_log10_f32_nnan_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_nnan_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 
+; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_nnan_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_nnan_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_nnan_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; 
GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_nnan_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_nnan_dynamic: ; R600: ; %bb.0: @@ -1174,21 +2287,67 @@ define float @v_log10_f32_ninf_daz(float %in) #0 { } define float @v_log10_f32_ninf_dynamic(float %in) #1 { -; GFX689-LABEL: v_log10_f32_ninf_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_ninf_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: 
v_log10_f32_ninf_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_ninf_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_ninf_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_ninf_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_ninf_dynamic: ; R600: ; %bb.0: @@ -1204,21 +2363,67 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { } define float @v_log10_f32_nnan_ninf(float %in) { -; GFX689-LABEL: v_log10_f32_nnan_ninf: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_nnan_ninf: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_nnan_ninf: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 
v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_nnan_ninf: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_nnan_ninf: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_nnan_ninf: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 
v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_nnan_ninf: ; R600: ; %bb.0: @@ -1264,21 +2469,67 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 { } define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { -; GFX689-LABEL: v_log10_f32_nnan_ninf_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_nnan_ninf_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_nnan_ninf_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_nnan_ninf_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_nnan_ninf_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_nnan_ninf_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: 
v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_nnan_ninf_dynamic: ; R600: ; %bb.0: @@ -1324,21 +2575,67 @@ define float @v_log10_f32_fast_daz(float %in) #0 { } define float @v_log10_f32_dynamic_mode(float %in) #1 { -; GFX689-LABEL: v_log10_f32_dynamic_mode: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_dynamic_mode: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_dynamic_mode: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_dynamic_mode: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 
v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_dynamic_mode: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_dynamic_mode: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_dynamic_mode: ; R600: ; %bb.0: 
@@ -1354,21 +2651,54 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { } define float @v_log10_f32_undef() { -; GFX689-LABEL: v_log10_f32_undef: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, s4 -; GFX689-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log10_f32_undef: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, s4 +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_undef: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log10_f32_undef: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e64 v2, s4, 1.0 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log10_f32_undef: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; 
GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log10_f32_undef: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v1, s0, 1.0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_undef: ; R600: ; %bb.0: @@ -1388,14 +2718,24 @@ define float @v_log10_f32_0() { ; GFX689-SDAG: ; %bb.0: ; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX689-SDAG-NEXT: v_log_f32_e32 v0, 0 +; GFX689-SDAG-NEXT: v_add_f32_e32 v0, 0xc2000000, v0 ; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log10_f32_0: ; GFX689-GISEL: ; %bb.0: ; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x3e9a209b -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0xff800000, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e64 v2, 0, 1.0 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, 
v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-LABEL: v_log10_f32_0: @@ -1404,6 +2744,8 @@ define float @v_log10_f32_0() { ; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, 0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, 0xc2000000, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1411,9 +2753,17 @@ define float @v_log10_f32_0() { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0x3e9a209b -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0xff800000, v0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, 0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v1, 0, 1.0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, 0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_0: @@ -1470,8 +2820,16 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, 
v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1538,7 +2896,15 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX689-SDAG-LABEL: v_log10_f32_from_fpext_bf16: ; GFX689-SDAG: ; %bb.0: ; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1554,8 +2920,15 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; 
GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2923,3 +4296,5 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 9d00f4a06e4c2..41ee4f89c34c5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -12,74 +12,197 @@ ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { -; SI-LABEL: s_log2_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_log_f32_e32 v0, s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: s_log2_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_log_f32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX900-LABEL: s_log2_f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_log_f32_e32 v0, s4 -; GFX900-NEXT: global_store_dword v1, v0, s[2:3] -; GFX900-NEXT: s_endpgm -; -; GFX1100-LABEL: s_log2_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 -; GFX1100-NEXT: v_mov_b32_e32 v1, 0 -; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_log_f32_e32 v0, s2 -; GFX1100-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1100-NEXT: s_endpgm +; SI-SDAG-LABEL: s_log2_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_log2_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_log2_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 
0x42000000 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_log2_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX900-SDAG-LABEL: s_log2_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: 
global_store_dword v2, v0, s[2:3] +; GFX900-SDAG-NEXT: s_endpgm +; +; GFX900-GISEL-LABEL: s_log2_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX900-GISEL-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: s_log2_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: s_log2_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; R600-NEXT: LOG_IEEE * T1.X, KC0[2].Z, +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.W, KC0[2].Z, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: ADD T0.X, PS, -T0.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_log2_f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 
4: -; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, T0.W, 0.0, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[2].Z, PV.W, +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: ADD * T0.X, PV.X, -T0.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T1.X, KC0[2].Z, -; CM-NEXT: LOG_IEEE T1.Y (MASKED), KC0[2].Z, -; CM-NEXT: LOG_IEEE T1.Z (MASKED), KC0[2].Z, -; CM-NEXT: LOG_IEEE * T1.W (MASKED), KC0[2].Z, %result = call float @llvm.log2.f32(float %in) store float %result, ptr addrspace(1) %out ret void @@ -91,33 +214,74 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-LABEL: s_log2_v2f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v1, s3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, s2 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s4, 
s0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s2 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s3 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v0, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v3, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v1, s3 -; VI-SDAG-NEXT: v_log_f32_e32 v0, s2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, 
vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v2, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -125,86 +289,183 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s2 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v0, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; 
GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, s3 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s2 -; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s2 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s3 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc +; GFX900-GISEL-NEXT: 
v_cndmask_b32_e64 v0, 1.0, v0, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, s3 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s2 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2 +; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s2 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_v2f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.Y, KC0[3].X, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[2].W, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].W, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; 
R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.W, KC0[3].X, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Z, KC0[2].W, PS, +; R600-NEXT: CNDE T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.Y, PS, -PV.W, +; R600-NEXT: CNDE T0.W, T1.W, 0.0, literal.x, +; R600-NEXT: LOG_IEEE * T0.X, PV.Z, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.X, PS, -PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_log2_v2f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].X, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[2].W, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: CNDE T1.Z, T0.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].X, PV.Z, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: ADD T1.Y, PV.X, -T1.Z, +; CM-NEXT: CNDE T0.Z, T1.W, 0.0, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[2].W, T0.Y, +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: ADD * T1.X, PV.X, -T0.Z, ; CM-NEXT: LSHR * 
T0.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T1.X (MASKED), KC0[3].X, -; CM-NEXT: LOG_IEEE T1.Y, KC0[3].X, -; CM-NEXT: LOG_IEEE T1.Z (MASKED), KC0[3].X, -; CM-NEXT: LOG_IEEE * T1.W (MASKED), KC0[3].X, -; CM-NEXT: LOG_IEEE T1.X, KC0[2].W, -; CM-NEXT: LOG_IEEE T1.Y (MASKED), KC0[2].W, -; CM-NEXT: LOG_IEEE T1.Z (MASKED), KC0[2].W, -; CM-NEXT: LOG_IEEE * T1.W (MASKED), KC0[2].W, %result = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) store <2 x float> %result, ptr addrspace(1) %out ret void @@ -215,12 +476,30 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s4, v6 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v2, s6 -; SI-SDAG-NEXT: v_log_f32_e32 v1, s5 -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v7 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v5 ; SI-SDAG-NEXT: 
buffer_store_dword v2, off, s[0:3], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm @@ -228,26 +507,65 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; SI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_v3f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v2, s6 -; VI-SDAG-NEXT: v_log_f32_e32 v1, s5 -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -255,13 +573,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; 
VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; @@ -269,23 +608,62 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, s6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], 
s[2:3] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, s6 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; 
GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v4 +; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -294,12 +672,30 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_clause 0x1 ; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, s6 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-SDAG-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-SDAG-NEXT: 
v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 +; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -308,56 +704,114 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: 
v_cndmask_b32_e64 v5, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0 +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_v3f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: ALU 29, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.Y, KC0[3].Z, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Y, -; R600-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; R600-NEXT: LSHR T2.X, PV.W, literal.x, -; R600-NEXT: LOG_IEEE * T3.X, KC0[3].W, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].Z, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].Y, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.W, KC0[3].Z, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, KC0[3].Y, PS, +; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].W, +; R600-NEXT: CNDE T0.W, T0.W, 0.0, literal.y, +; 
R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: 8388608(1.175494e-38), 1107296256(3.200000e+01) +; R600-NEXT: ADD T1.Y, PS, -PV.W, +; R600-NEXT: CNDE T1.Z, PV.Z, 1.0, literal.x, +; R600-NEXT: CNDE T0.W, T1.W, 0.0, literal.y, +; R600-NEXT: LOG_IEEE * T0.X, PV.Y, +; R600-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; R600-NEXT: ADD T1.X, PS, -PV.W, +; R600-NEXT: MUL_IEEE T0.W, KC0[3].W, PV.Z, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: CNDE T1.W, T0.Z, 0.0, literal.x, +; R600-NEXT: LOG_IEEE * T0.Y, PV.W, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T2.X, PS, -PV.W, +; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; R600-NEXT: LSHR * T3.X, PV.W, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_log2_v3f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3, T1.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X +; CM-NEXT: ALU 35, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR * T0.X, PV.W, literal.x, -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].W, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].Y, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.X, PV.W, 1.0, literal.x, +; CM-NEXT: CNDE T1.Y, PV.Z, 1.0, literal.x, +; CM-NEXT: CNDE T1.Z, T0.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].W, PV.Y, +; CM-NEXT: 
1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y, T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: ADD T1.X, PV.Y, -T1.Z, +; CM-NEXT: CNDE T0.Y, T0.Z, 0.0, literal.x, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Z, T1.Y, +; CM-NEXT: 1107296256(3.200000e+01), 8(1.121039e-44) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W, T0.W, +; CM-NEXT: LSHR T2.X, T0.Z, literal.x, +; CM-NEXT: ADD T0.Y, PV.W, -T0.Y, +; CM-NEXT: CNDE T0.Z, T1.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Y, T0.X, +; CM-NEXT: 2(2.802597e-45), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X, T0.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: ADD * T0.X, PV.X, -T0.Z, +; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T2.X, KC0[3].W, -; CM-NEXT: LOG_IEEE T2.Y (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T2.Z (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE * T2.W (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T3.X (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T3.Y, KC0[3].Z, -; CM-NEXT: LOG_IEEE T3.Z (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE * T3.W (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T3.X, KC0[3].Y, -; CM-NEXT: LOG_IEEE T3.Y (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE T3.Z (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE * T3.W (MASKED), KC0[3].Y, %result = call <3 x float> @llvm.log2.v3f32(<3 x float> %in) store <3 x float> %result, ptr addrspace(1) %out ret void @@ -368,42 +822,114 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log2_v4f32: ; SI-SDAG: ; %bb.0: -; 
SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v3, s7 -; SI-SDAG-NEXT: v_log_f32_e32 v2, s6 -; SI-SDAG-NEXT: v_log_f32_e32 v1, s5 -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_log_f32_e32 v8, v8 +; SI-SDAG-NEXT: v_log_f32_e32 v9, v1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; SI-GISEL-NEXT: v_log_f32_e32 v3, s7 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-GISEL-NEXT: s_endpgm ; ; 
VI-SDAG-LABEL: s_log2_v4f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v3, s7 -; VI-SDAG-NEXT: v_log_f32_e32 v2, s6 -; VI-SDAG-NEXT: v_log_f32_e32 v1, s5 -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v8, v8 +; VI-SDAG-NEXT: v_log_f32_e32 v9, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -411,14 +937,40 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-LABEL: s_log2_v4f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_log_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_log_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_log_f32_e32 v3, s7 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm ; @@ -426,12 +978,35 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v3, s7 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, s6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, s7, v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s6, v7 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v9, s5, v9 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX900-SDAG-NEXT: v_log_f32_e32 v9, v9 +; GFX900-SDAG-NEXT: v_log_f32_e32 v10, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -439,12 +1014,38 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; 
GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, s6 -; GFX900-GISEL-NEXT: v_log_f32_e32 v3, s7 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5 +; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -453,13 +1054,35 @@ define amdgpu_kernel void 
@s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-NEXT: s_clause 0x1 ; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, s7 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, s6 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: 
v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 +; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -468,73 +1091,192 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, s6 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 +; 
GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_v4f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 33, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LOG_IEEE * T0.W, KC0[4].X, -; R600-NEXT: LOG_IEEE * T0.Z, KC0[3].W, -; R600-NEXT: LOG_IEEE * T0.Y, KC0[3].Z, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: LOG_IEEE * T0.X, KC0[3].Y, +; R600-NEXT: SETGT T0.W, literal.x, KC0[4].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].W, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Z, KC0[4].X, PV.W, +; R600-NEXT: SETGT T2.W, literal.x, KC0[3].Z, +; R600-NEXT: CNDE * T3.W, T1.W, 1.0, literal.y, +; R600-NEXT: 8388608(1.175494e-38), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T0.X, KC0[3].W, PS, +; R600-NEXT: CNDE T0.Y, T0.W, 0.0, literal.x, +; R600-NEXT: SETGT T1.Z, literal.y, KC0[3].Y, +; R600-NEXT: CNDE T0.W, PV.W, 1.0, 
literal.z, +; R600-NEXT: LOG_IEEE * T0.Z, PV.Z, +; R600-NEXT: 1107296256(3.200000e+01), 8388608(1.175494e-38) +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.X, KC0[3].Z, PV.W, +; R600-NEXT: CNDE T1.Y, T1.W, 0.0, literal.x, +; R600-NEXT: CNDE T2.Z, PV.Z, 1.0, literal.y, +; R600-NEXT: ADD T0.W, PS, -PV.Y, +; R600-NEXT: LOG_IEEE * T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 1333788672(4.294967e+09) +; R600-NEXT: MUL_IEEE T2.Y, KC0[3].Y, PV.Z, +; R600-NEXT: ADD T0.Z, PS, -PV.Y, +; R600-NEXT: CNDE T1.W, T2.W, 0.0, literal.x, +; R600-NEXT: LOG_IEEE * T0.X, PV.X, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.Y, PS, -PV.W, +; R600-NEXT: CNDE T1.W, T1.Z, 0.0, literal.x, +; R600-NEXT: LOG_IEEE * T0.X, PV.Y, +; R600-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.X, PS, -PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_log2_v4f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X +; CM-NEXT: ALU 43, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[4].X, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 1.0, literal.x, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].Z, +; CM-NEXT: 1333788672(4.294967e+09), 8388608(1.175494e-38) +; CM-NEXT: CNDE T0.X, PV.W, 1.0, literal.x, +; CM-NEXT: CNDE T1.Y, T0.W, 0.0, literal.y, +; CM-NEXT: CNDE T1.Z, PV.Z, 1.0, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[4].X, PV.Y, +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: LOG_IEEE T0.Y, T0.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: LOG_IEEE * T0.W 
(MASKED), T0.W, +; CM-NEXT: CNDE T1.X, T0.Z, 0.0, literal.x, +; CM-NEXT: SETGT T2.Y, literal.y, KC0[3].Y, +; CM-NEXT: MUL_IEEE T0.Z, KC0[3].W, T1.Z, +; CM-NEXT: ADD * T0.W, PV.Y, -T1.Y, +; CM-NEXT: 1107296256(3.200000e+01), 8388608(1.175494e-38) +; CM-NEXT: LOG_IEEE T0.X (MASKED), T0.Z, +; CM-NEXT: LOG_IEEE T0.Y, T0.Z, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T0.Z, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T0.Z, +; CM-NEXT: CNDE T2.X, T2.Y, 1.0, literal.x, +; CM-NEXT: CNDE T1.Y, T1.W, 0.0, literal.y, +; CM-NEXT: ADD T0.Z, PV.Y, -T1.X, +; CM-NEXT: MUL_IEEE * T1.W, KC0[3].Z, T0.X, BS:VEC_021/SCL_122 +; CM-NEXT: 1333788672(4.294967e+09), 1107296256(3.200000e+01) +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: ADD T0.Y, PV.X, -T1.Y, +; CM-NEXT: CNDE T1.Z, T2.Y, 0.0, literal.x, +; CM-NEXT: MUL_IEEE * T1.W, KC0[3].Y, T2.X, +; CM-NEXT: 1107296256(3.200000e+01), 0(0.000000e+00) +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: ADD * T0.X, PV.X, -T1.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LOG_IEEE T1.X (MASKED), KC0[4].X, -; CM-NEXT: LOG_IEEE T1.Y (MASKED), KC0[4].X, -; CM-NEXT: LOG_IEEE T1.Z (MASKED), KC0[4].X, -; CM-NEXT: LOG_IEEE * T1.W, KC0[4].X, -; CM-NEXT: LOG_IEEE T1.X (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T1.Y (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T1.Z, KC0[3].W, -; CM-NEXT: LOG_IEEE * T1.W (MASKED), KC0[3].W, -; CM-NEXT: LOG_IEEE T1.X (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T1.Y, KC0[3].Z, -; CM-NEXT: LOG_IEEE T1.Z (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE * T1.W (MASKED), KC0[3].Z, -; CM-NEXT: LOG_IEEE T1.X, KC0[3].Y, -; CM-NEXT: LOG_IEEE T1.Y (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE T1.Z (MASKED), KC0[3].Y, -; CM-NEXT: LOG_IEEE * T1.W (MASKED), KC0[3].Y, 
%result = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) store <4 x float> %result, ptr addrspace(1) %out ret void } define float @v_log2_f32(float %in) { -; GFX689-LABEL: v_log2_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 
vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32: ; R600: ; %bb.0: @@ -550,18 +1292,63 @@ define float @v_log2_f32(float %in) { } define float @v_log2_fabs_f32(float %in) { -; GFX689-LABEL: v_log2_fabs_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, |v0| -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_fabs_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: 
v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_fabs_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, |v0| -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_fabs_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_fabs_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_fabs_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_fabs_f32: ; R600: ; %bb.0: @@ -578,18 +1365,63 @@ define float @v_log2_fabs_f32(float %in) { } define float @v_log2_fneg_fabs_f32(float %in) { -; GFX689-LABEL: v_log2_fneg_fabs_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, -|v0| -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_fneg_fabs_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_fneg_fabs_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, -|v0| -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_fneg_fabs_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; GFX689-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_fneg_fabs_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_fneg_fabs_f32: ; R600: ; %bb.0: @@ -607,18 +1439,62 @@ define float @v_log2_fneg_fabs_f32(float %in) { } define float @v_log2_fneg_f32(float %in) { 
-; GFX689-LABEL: v_log2_fneg_f32: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, -v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_fneg_f32: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_fneg_f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e64 v0, -v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_fneg_f32: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_fneg_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: 
v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_fneg_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_fneg_f32: ; R600: ; %bb.0: @@ -716,18 +1592,61 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" } define float @v_log2_f32_ninf(float %in) { -; GFX689-LABEL: v_log2_f32_ninf: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_ninf: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 
0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_ninf: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_ninf: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_ninf: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_ninf: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_ninf: ; R600: ; %bb.0: @@ -879,18 +1798,61 @@ define float @v_log2_f32_daz(float %in) #0 { } define float @v_log2_f32_nnan(float %in) { -; GFX689-LABEL: v_log2_f32_nnan: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_nnan: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_nnan: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_nnan: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_nnan: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_nnan: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_nnan: ; R600: ; %bb.0: @@ -933,18 +1895,61 @@ define float @v_log2_f32_nnan_daz(float %in) #0 { } define float @v_log2_f32_nnan_dynamic(float %in) #1 { -; GFX689-LABEL: v_log2_f32_nnan_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_nnan_dynamic: +; 
GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_nnan_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_nnan_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_nnan_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 
+; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_nnan_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_nnan_dynamic: ; R600: ; %bb.0: @@ -987,18 +1992,61 @@ define float @v_log2_f32_ninf_daz(float %in) #0 { } define float @v_log2_f32_ninf_dynamic(float %in) #1 { -; GFX689-LABEL: v_log2_f32_ninf_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_ninf_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_ninf_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_ninf_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_ninf_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_ninf_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; 
GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_ninf_dynamic: ; R600: ; %bb.0: @@ -1014,18 +2062,61 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { } define float @v_log2_f32_nnan_ninf(float %in) { -; GFX689-LABEL: v_log2_f32_nnan_ninf: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_nnan_ninf: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_nnan_ninf: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_nnan_ninf: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_nnan_ninf: ; R600: ; %bb.0: @@ -1068,18 +2159,61 @@ define float @v_log2_f32_nnan_ninf_daz(float %in) #0 { } define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { -; GFX689-LABEL: v_log2_f32_nnan_ninf_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: 
v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_nnan_ninf_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 
s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_nnan_ninf_dynamic: ; R600: ; %bb.0: @@ -1122,18 +2256,61 @@ define float @v_log2_f32_fast_daz(float %in) #0 { } define float @v_log2_f32_dynamic_mode(float %in) #1 { -; GFX689-LABEL: v_log2_f32_dynamic_mode: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_dynamic_mode: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_dynamic_mode: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: 
v_log2_f32_dynamic_mode: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_dynamic_mode: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_dynamic_mode: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: 
s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_dynamic_mode: ; R600: ; %bb.0: @@ -1149,18 +2326,49 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { } define float @v_log2_f32_undef() { -; GFX689-LABEL: v_log2_f32_undef: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, s4 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_undef: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, s4 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_undef: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_undef: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX689-GISEL-NEXT: v_mul_f32_e64 v2, s4, 1.0 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_undef: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_undef: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 
+; GFX1100-GISEL-NEXT: v_mul_f32_e64 v1, s0, 1.0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_undef: ; R600: ; %bb.0: @@ -1180,6 +2388,7 @@ define float @v_log2_f32_0() { ; GFX689-SDAG: ; %bb.0: ; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX689-SDAG-NEXT: v_log_f32_e32 v0, 0 +; GFX689-SDAG-NEXT: v_add_f32_e32 v0, 0xc2000000, v0 ; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_0: @@ -1193,6 +2402,8 @@ define float @v_log2_f32_0() { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, 0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, 0xc2000000, v0 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-GISEL-LABEL: v_log2_f32_0: @@ -1253,8 +2464,16 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; 
SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_f32_from_fpext_math_f16: @@ -1315,7 +2534,15 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; GFX689-SDAG-LABEL: v_log2_f32_from_fpext_bf16: ; GFX689-SDAG: ; %bb.0: ; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_from_fpext_bf16: @@ -1329,7 +2556,14 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-GISEL-LABEL: v_log2_f32_from_fpext_bf16: @@ -1362,6 +2596,8 @@ define half @v_log2_f16(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_f16: @@ -1411,6 +2647,8 @@ define half @v_log2_fabs_f16(half %in) { ; 
SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fabs_f16: @@ -1461,6 +2699,8 @@ define half @v_log2_fneg_fabs_f16(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fneg_fabs_f16: @@ -1512,6 +2752,8 @@ define half @v_log2_fneg_f16(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fneg_f16: @@ -1562,6 +2804,8 @@ define half @v_log2_f16_fast(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_f16_fast: @@ -1614,6 +2858,10 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v2f16: @@ -1694,6 +2942,10 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fabs_v2f16: @@ -1799,6 +3051,10 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fneg_fabs_v2f16: @@ -1905,6 +3161,10 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fneg_v2f16: @@ -2006,6 +3266,10 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v2f16_fast: @@ -2089,6 +3353,12 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: 
v_log2_v3f16: @@ -2180,6 +3450,12 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v3f16_fast: @@ -2274,6 +3550,14 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16: @@ -2398,6 +3682,14 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16_fast: @@ -2521,3 +3813,5 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn 
memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SI: {{.*}}