diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d03b4c0f912a8..815b2ddffd1aa 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -970,6 +970,13 @@ The AMDGPU backend implements the following LLVM IR intrinsics. :ref:`llvm.log10 <int_log10>` Implemented for float and half (and vectors). + :ref:`llvm.exp2 <int_exp2>` Implemented for float and half (and vectors of float or + half). Not implemented for double. Hardware provides + 1ULP accuracy for float, and 0.51ULP for half. Float + instruction does not natively support denormal + inputs. Backend will optimize out denormal scaling if + marked with the :ref:`afn <fastmath_afn>` flag. + ========================================= ========================================================== .. TODO:: diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 6d078bc8e129a..6a5f15663b142 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -14678,6 +14678,8 @@ trapping or setting ``errno``. When specified with the fast-math-flag 'afn', the result may be approximated using a less accurate calculation. +.. _int_exp2: + '``llvm.exp2.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 78246609bc740..6cbd33aaa02c7 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -155,6 +155,9 @@ Changes to the AMDGPU Backend accurately. Use llvm.amdgcn.log.f32 to access the old behavior for llvm.log2.f32. +* llvm.exp2.f32 is now lowered accurately. Use llvm.amdgcn.exp2.f32 to + access the old behavior for llvm.exp2.f32. 
+ Changes to the ARM Backend -------------------------- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a6da98c953d8a..da2403fbb4d0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -328,14 +328,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. - setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FABS, ISD::FFLOOR, - ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, + setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, ISD::FRINT, + ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); - setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom); + setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2}, MVT::f32, + Custom); setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom); @@ -347,7 +348,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); - setOperationAction(ISD::FLOG2, MVT::f16, Custom); + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); } setOperationAction({ISD::FLOG10, ISD::FLOG}, MVT::f16, Custom); @@ -1315,6 +1316,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, return LowerFLOGCommon(Op, DAG); case ISD::FEXP: return lowerFEXP(Op, DAG); + case ISD::FEXP2: + return lowerFEXP2(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); @@ -1352,6 +1355,10 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, if 
(SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG)) Results.push_back(Lowered); return; + case ISD::FEXP2: + if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG)) + Results.push_back(Lowered); + return; default: return; } @@ -2681,6 +2688,54 @@ SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL, Flags); } +SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { + // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. + + SDLoc SL(Op); + EVT VT = Op.getValueType(); + SDValue Src = Op.getOperand(0); + SDNodeFlags Flags = Op->getFlags(); + + if (VT == MVT::f16) { + // Nothing in half is a denormal when promoted to f32. + assert(!Subtarget->has16BitInsts()); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); + SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); + } + + if (!needsDenormHandlingF32(DAG, Src, Flags)) + return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags); + + // bool needs_scaling = x < -0x1.f80000p+6f; + // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 
0x1.0p-64f : 1.0f); + + // -126.0f; exp2 of any smaller input is denormal in f32. + SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT); + + SDValue NeedsScaling = DAG.getSetCC( + SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, + RangeCheckConst, ISD::SETOLT); + + SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT); + SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + + SDValue AddOffset = + DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero); + + SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags); + SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags); + + SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT); + SDValue One = DAG.getConstantFP(1.0, SL, VT); + SDValue ResultScale = + DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One); + + return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); +} + // exp2(M_LOG2E_F * f); SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 200a7bc4eca87..3d5ba50bc95b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -73,6 +73,7 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, double Log2BaseInverted, SDNodeFlags Flags) const; + SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index a79ff0b55624c..b69cae0c73b3f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -397,8 +397,7 @@ def AMDGPUlogf16 : PatFrags<(ops 
node:$src), [(int_amdgcn_log node:$src), (flog2 node:$src)]>; def AMDGPUexp : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src), - (AMDGPUexp_impl node:$src), - (fexp2 node:$src)]>; // FIXME: Remove me + (AMDGPUexp_impl node:$src)]>; def AMDGPUexpf16 : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src), (fexp2 node:$src)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 6266f337def3b..dfd22435801c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1122,14 +1122,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); // FIXME: fpow has a selection pattern that should move to custom lowering. - auto &Exp2Ops = getActionDefinitionsBuilder(G_FEXP2); - if (ST.has16BitInsts()) - Exp2Ops.legalFor({S32, S16}); - else - Exp2Ops.legalFor({S32}); - Exp2Ops.clampScalar(0, MinScalarFPTy, S32); - Exp2Ops.scalarize(0); - auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FPOW}); if (ST.has16BitInsts()) ExpOps.customFor({{S32}, {S16}}); @@ -1142,7 +1134,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(0, MinScalarFPTy, S32) .lower(); - auto &Log2Ops = getActionDefinitionsBuilder(G_FLOG2); + auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); Log2Ops.customFor({S32}); if (ST.has16BitInsts()) Log2Ops.legalFor({S16}); @@ -2019,6 +2011,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, case TargetOpcode::G_FLOG: case TargetOpcode::G_FLOG10: return legalizeFlogCommon(MI, B); + case TargetOpcode::G_FEXP2: + return legalizeFExp2(MI, B); case TargetOpcode::G_FEXP: return legalizeFExp(MI, B); case TargetOpcode::G_FPOW: @@ -3230,6 +3224,64 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, return true; } +bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, + MachineIRBuilder &B) const { + // v_exp_f32 is good 
enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + LLT Ty = B.getMRI()->getType(Dst); + const LLT F16 = LLT::scalar(16); + const LLT F32 = LLT::scalar(32); + + if (Ty == F16) { + // Nothing in half is a denormal when promoted to f32. + auto Ext = B.buildFPExt(F32, Src, Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); + B.buildFPTrunc(Dst, Log2, Flags); + MI.eraseFromParent(); + return true; + } + + assert(Ty == F32); + + if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { + B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef{Dst}, false) + .addUse(Src) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; + } + + // bool needs_scaling = x < -0x1.f80000p+6f; + // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 
0x1.0p-64f : 1.0f); + + // -126.0f; exp2 of any smaller input is denormal in f32. + auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); + auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, + RangeCheckConst, Flags); + + auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); + auto Zero = B.buildFConstant(Ty, 0.0); + auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); + auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); + + auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) + .addUse(AddInput.getReg(0)) + .setMIFlags(Flags); + + auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); + auto One = B.buildFConstant(Ty, 1.0); + auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); + B.buildFMul(Dst, Exp2, ResultScale, Flags); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const { Register Dst = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 2863d36ce059d..2d4cf4c3865bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -86,6 +86,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, double Log2BaseInverted, unsigned Flags) const; + bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index b7b82cdf8dcce..73c0a0c92b2e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -19,7 +19,15 @@ 
define float @v_pow_f32(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32: @@ -35,7 +43,15 @@ define float @v_pow_f32(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32: @@ -51,7 +67,15 @@ define float @v_pow_f32(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32: @@ -64,7 +88,12 @@ define float @v_pow_f32(float %x, float %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; 
GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32: @@ -80,7 +109,14 @@ define float @v_pow_f32(float %x, float %y) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -95,20 +131,34 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x42000000 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x800000 +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[4:5] ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, 
v7 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX6-NEXT: s_mov_b32 s6, 0xc2fc0000 +; GFX6-NEXT: v_sub_f32_e32 v1, v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc +; GFX6-NEXT: v_cmp_gt_f32_e64 s[4:5], s6, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f32: @@ -119,20 +169,34 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x42000000 -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x800000 +; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[4:5] ; GFX8-NEXT: v_log_f32_e32 v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX8-NEXT: v_log_f32_e32 v1, v1 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX8-NEXT: s_mov_b32 s6, 0xc2fc0000 
+; GFX8-NEXT: v_sub_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc +; GFX8-NEXT: v_cmp_gt_f32_e64 s[4:5], s6, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX8-NEXT: v_exp_f32_e32 v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f32: @@ -143,20 +207,34 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x42000000 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x800000 +; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[4:5] ; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX9-NEXT: v_log_f32_e32 v1, v1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_mov_b32 s6, 0xc2fc0000 +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v2, vcc +; 
GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], s6, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f32: @@ -176,8 +254,18 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0xc2fc0000, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s4 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s4 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_exp_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f32: @@ -198,9 +286,20 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 
0x42800000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s0 ; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y) ret <2 x float> %pow @@ -212,9 +311,17 @@ define half @v_pow_f16(half %x, half %y) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -273,16 +380,29 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000 ; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, 
v1, v3 +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -385,12 +505,25 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_exp_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, v1, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs: @@ -487,21 +620,34 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x 
half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 +; GFX6-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -602,23 +748,36 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_log_f32_e32 v3, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX6-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX6-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_exp_f32_e32 v1, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v2, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -738,7 +897,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; 
GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_lhs: @@ -754,7 +921,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_lhs: @@ -770,7 +945,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_lhs: @@ -783,7 +966,12 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: 
v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_lhs: @@ -799,8 +987,15 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %pow = call float @llvm.pow.f32(float %fabs.x, float %y) @@ -821,7 +1016,15 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_rhs: @@ -837,7 +1040,15 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; 
GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_rhs: @@ -853,7 +1064,15 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_rhs: @@ -866,7 +1085,12 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_rhs: @@ -882,7 +1106,14 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %x, float %fabs.y) @@ -903,7 +1134,15 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs: @@ -919,7 +1158,15 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs: @@ -935,7 +1182,15 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: 
v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs: @@ -948,7 +1203,12 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs: @@ -964,8 +1224,15 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) @@ -986,7 +1253,15 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX6-NEXT: v_sub_f32_e32 v1, v1, v2 ; 
GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_vgpr: @@ -1001,7 +1276,15 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_vgpr: @@ -1016,7 +1299,15 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_vgpr: @@ -1028,7 +1319,12 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX10-NEXT: v_log_f32_e32 
v1, v1 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_vgpr: @@ -1043,8 +1339,15 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1061,9 +1364,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: 
; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_vgpr_sgpr: @@ -1076,9 +1387,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_vgpr_sgpr: @@ -1091,9 +1410,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_vgpr_sgpr: @@ -1105,7 +1432,12 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 
1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_vgpr_sgpr: @@ -1120,7 +1452,14 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1137,9 +1476,17 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_sgpr: @@ -1152,9 +1499,17 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 
0x42800000 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_sgpr: @@ -1167,9 +1522,17 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_sgpr: @@ -1181,7 +1544,12 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_sgpr: @@ -1196,8 +1564,15 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX11-NEXT: 
s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1217,7 +1592,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fneg_lhs: @@ -1233,7 +1616,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fneg_lhs: @@ -1249,7 +1640,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fneg_lhs: @@ -1262,7 +1661,12 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fneg_lhs: @@ -1278,8 +1682,15 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %pow = call float @llvm.pow.f32(float %neg.x, float %y) @@ -1300,7 +1711,15 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fneg_rhs: @@ -1316,7 +1735,15 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fneg_rhs: @@ -1332,7 +1759,15 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: 
v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fneg_rhs: @@ -1345,7 +1780,12 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fneg_rhs: @@ -1361,7 +1801,14 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, -v1 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %pow = call float @llvm.pow.f32(float %x, float %neg.y) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir index ccda3ddb9bf05..10ffcf1342ad4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji 
-run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- -name: fexp2_s32_vs +name: fexp2_s16_vs legalized: true regBankSelected: true tracksRegLiveness: true @@ -11,19 +11,20 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: fexp2_s32_vs + ; CHECK-LABEL: name: fexp2_s16_vs ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %1:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit %1 + ; CHECK-NEXT: [[V_EXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_EXP_F16_e64_]] %0:sgpr(s32) = COPY $sgpr0 - %1:vgpr(s32) = G_FEXP2 %0 - S_ENDPGM 0, implicit %1 + %1:sgpr(s16) = G_TRUNC %0 + %2:vgpr(s16) = G_FEXP2 %1 + S_ENDPGM 0, implicit %2 ... --- -name: fexp2_s32_vv +name: fexp2_s16_vv legalized: true regBankSelected: true tracksRegLiveness: true @@ -32,13 +33,14 @@ body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: fexp2_s32_vv + ; CHECK-LABEL: name: fexp2_s16_vv ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: %1:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit %1 + ; CHECK-NEXT: [[V_EXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_EXP_F16_e64_]] %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = G_FEXP2 %0 - S_ENDPGM 0, implicit %1 + %1:vgpr(s16) = G_TRUNC %0 + %2:vgpr(s16) = G_FEXP2 %1 + S_ENDPGM 0, implicit %2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir index b6ae2938e6eff..e43e302eb2293 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir @@ -17,24 +17,54 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C]] - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: $vgpr0 = COPY [[FMUL1]](s32) ; GFX8-LABEL: name: test_fexp_s32 ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C]] - ; GFX8-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX8-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), 
[[FMUL]](s32), [[C1]] + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX8-NEXT: $vgpr0 = COPY [[FMUL1]](s32) ; GFX9-LABEL: name: test_fexp_s32 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C]] - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX9-NEXT: $vgpr0 = COPY 
[[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_FEXP %0 $vgpr0 = COPY %1 @@ -52,24 +82,54 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[C]] - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[FMUL]] - ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: $vgpr0 = COPY [[FMUL1]](s32) ; GFX8-LABEL: name: test_fexp_s32_nnan ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[C]] - ; GFX8-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[FMUL]] - ; GFX8-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 
0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[SELECT]] + ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT]], [[SELECT1]] + ; GFX8-NEXT: $vgpr0 = COPY [[FMUL1]](s32) ; GFX9-LABEL: name: test_fexp_s32_nnan ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[C]] - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[FMUL]] - ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT]], [[SELECT1]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = nnan G_FEXP %0 $vgpr0 = COPY %1 @@ 
-88,10 +148,25 @@ body: | ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C]] - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL1]] - ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL2]](s32), [[C1]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[SELECT2]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = 
G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX8-LABEL: name: test_fexp_v2s32 ; GFX8: liveins: $vgpr0_vgpr1 @@ -100,10 +175,25 @@ body: | ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C]] - ; GFX8-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] - ; GFX8-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL1]] - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX8-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL2]](s32), [[C1]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[SELECT2]] + ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX8-NEXT: 
[[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX8-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_fexp_v2s32 ; GFX9: liveins: $vgpr0_vgpr1 @@ -112,10 +202,25 @@ body: | ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C]] - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] - ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL2]](s32), [[C1]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; 
GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[SELECT2]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FEXP %0 @@ -135,12 +240,32 @@ body: | ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C]] - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL1]] - ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[C]] - ; GFX6-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL2]] - ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = 
G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL2]](s32), [[C1]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[SELECT2]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX6-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[C]] + ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL4]](s32), [[C1]] + ; GFX6-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[SELECT4]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD2]](s32) + ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT5]] + ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32), [[FMUL5]](s32) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX8-LABEL: name: test_fexp_v3s32 ; GFX8: liveins: $vgpr0_vgpr1_vgpr2 @@ -149,12 +274,32 @@ body: | ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C]] - ; GFX8-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] - ; GFX8-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL1]] - ; GFX8-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL 
[[UV2]], [[C]] - ; GFX8-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL2]] - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX8-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL2]](s32), [[C1]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[SELECT2]] + ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX8-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX8-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX8-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[C]] + ; GFX8-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL4]](s32), [[C1]] + ; GFX8-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[SELECT4]] + ; GFX8-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), 
[[FADD2]](s32) + ; GFX8-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C4]], [[C5]] + ; GFX8-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT5]] + ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32), [[FMUL5]](s32) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fexp_v3s32 ; GFX9: liveins: $vgpr0_vgpr1_vgpr2 @@ -163,12 +308,32 @@ body: | ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[C]] - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] - ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL1]] - ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[C]] - ; GFX9-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX9-NEXT: 
[[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[C]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL2]](s32), [[C1]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[SELECT2]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[C]] + ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL4]](s32), [[C1]] + ; GFX9-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[SELECT4]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD2]](s32) + ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C4]], [[C5]] + ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT5]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32), [[FMUL5]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FEXP %0 @@ -189,8 +354,18 @@ body: | ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[C]] - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 
0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX8-LABEL: name: test_fexp_s16 @@ -233,16 +408,31 @@ body: | ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT %4(s16) ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FF7154760000000 ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[C]] - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: 
[[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT %5(s16) - ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT1]], [[C]] - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FMUL1]] - ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_1]](s32) + ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT1]], [[C]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[FMUL2]](s32), [[C1]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[SELECT2]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX6-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX6-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir index 296af2b2d4917..adef120554bbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir @@ -15,20 +15,50 @@ body: | ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[COPY]] - ; GFX6-NEXT: $vgpr0 = 
COPY [[FEXP2_]](s32) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: $vgpr0 = COPY [[FMUL]](s32) ; GFX8-LABEL: name: test_fexp2_s32 ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX8-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[COPY]] - ; GFX8-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[SELECT]] + ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX8-NEXT: $vgpr0 = 
COPY [[FMUL]](s32) ; GFX9-LABEL: name: test_fexp2_s32 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[COPY]] - ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMUL]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_FEXP2 %0 $vgpr0 = COPY %1 @@ -45,27 +75,72 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[UV]] - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[UV1]] - ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], 
[[C2]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[SELECT2]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX8-LABEL: name: test_fexp2_v2s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX8-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[UV]] - ; GFX8-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[UV1]] - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) 
= G_FADD [[UV]], [[SELECT]] + ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[SELECT2]] + ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX8-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_fexp2_v2s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[UV]] - ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[UV1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV]], [[SELECT]] + ; GFX9-NEXT: 
[[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[SELECT2]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_FEXP2 %0 @@ -83,30 +158,90 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[UV]] - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[UV1]] - ; GFX6-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[UV2]] - ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], 
[[C2]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV]], [[SELECT]] + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[SELECT2]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX6-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[UV2]], [[SELECT4]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD2]](s32) + ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT5]] + ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX8-LABEL: name: test_fexp2_v3s32 ; GFX8: liveins: $vgpr0_vgpr1_vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX8-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[UV]] - ; GFX8-NEXT: 
[[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[UV1]] - ; GFX8-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[UV2]] - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV]], [[SELECT]] + ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[SELECT2]] + ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX8-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX8-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX8-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX8-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[UV2]], [[SELECT4]] + ; GFX8-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD2]](s32) + ; GFX8-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; GFX8-NEXT: 
[[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT5]] + ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fexp2_v3s32 ; GFX9: liveins: $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[UV]] - ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[UV1]] - ; GFX9-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[UV2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV]], [[SELECT]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT]], [[SELECT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[SELECT2]] + ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; 
GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT1]], [[SELECT3]] + ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX9-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[UV2]], [[SELECT4]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD2]](s32) + ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT5]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = G_FEXP2 %0 @@ -125,8 +260,8 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FPEXT]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FPEXT]](s32) + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX8-LABEL: name: test_fexp2_s16 @@ -168,11 +303,11 @@ body: | ; GFX6-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX6-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX6-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[FPEXT]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FPEXT]](s32) + ; 
GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[FPEXT1]] - ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_1]](s32) + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FPEXT1]](s32) + ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir index b592283dee139..fea0df308b3ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir @@ -27,8 +27,16 @@ body: | ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT2]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL 
[[INT2]], [[SELECT3]] + ; GFX6-NEXT: $vgpr0 = COPY [[FMUL1]](s32) ; GFX9-LABEL: name: test_fpow_s32 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -46,8 +54,16 @@ body: | ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT2]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT3]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_FPOW %0, %1 @@ -79,16 +95,29 @@ body: | ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV2]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] - ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] - ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] - ; GFX6-NEXT: 
[[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) - ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] - ; GFX6-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] - ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV3]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] - ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT2]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT3]] + ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX6-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT4]] + ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT3]], [[SELECT5]] + ; GFX6-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV3]](s32) + ; GFX6-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT4]](s32), [[C5]] + ; GFX6-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C6]], 
[[C4]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[SELECT6]] + ; GFX6-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT5]], [[SELECT7]] + ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_fpow_v2s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -109,16 +138,29 @@ body: | ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV2]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] - ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] - ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] - ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) - ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] - ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] - ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV3]](s32) - ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32) + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C6]], 
[[C4]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT2]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT3]] + ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] + ; GFX9-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT4]] + ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT3]], [[SELECT5]] + ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV3]](s32) + ; GFX9-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT4]](s32), [[C5]] + ; GFX9-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C6]], [[C4]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[SELECT6]] + ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX9-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C7]], [[C2]] + ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT5]], [[SELECT7]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 @@ -151,24 +193,42 @@ body: | ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV3]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] - ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] - ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] - ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) - ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] - ; GFX6-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] - ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV4]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] - ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT2]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT3]] + ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] ; GFX6-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] - ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT4]] - ; GFX6-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; GFX6-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT4]] + ; 
GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] - ; GFX6-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT4]], [[SELECT5]] - ; GFX6-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB2]](s32), [[UV5]](s32) - ; GFX6-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[INT5]] - ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX6-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT3]], [[SELECT5]] + ; GFX6-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV4]](s32) + ; GFX6-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT4]](s32), [[C5]] + ; GFX6-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C6]], [[C4]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[SELECT6]] + ; GFX6-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT5]], [[SELECT7]] + ; GFX6-NEXT: [[FCMP4:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX6-NEXT: [[SELECT8:%[0-9]+]]:_(s32) = G_SELECT [[FCMP4]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT8]] + ; GFX6-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL4]](s32) + ; GFX6-NEXT: [[SELECT9:%[0-9]+]]:_(s32) = G_SELECT [[FCMP4]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT6]], [[SELECT9]] + ; GFX6-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB2]](s32), [[UV5]](s32) + ; GFX6-NEXT: [[FCMP5:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT7]](s32), [[C5]] + ; GFX6-NEXT: [[SELECT10:%[0-9]+]]:_(s32) = G_SELECT [[FCMP5]](s1), [[C6]], 
[[C4]] + ; GFX6-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[SELECT10]] + ; GFX6-NEXT: [[INT8:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD2]](s32) + ; GFX6-NEXT: [[SELECT11:%[0-9]+]]:_(s32) = G_SELECT [[FCMP5]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[INT8]], [[SELECT11]] + ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32), [[FMUL5]](s32) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_fpow_v3s32 ; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 @@ -189,24 +249,42 @@ body: | ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[INT]], [[SELECT1]] ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[UV3]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] - ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] - ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT2]] - ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL1]](s32) - ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C3]], [[C4]] - ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT2]], [[SELECT3]] - ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV4]](s32) - ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] - ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT 
[[FCMP1]](s1), [[C6]], [[C4]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT2]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT3]] + ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s32), [[C]] ; GFX9-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C1]], [[C2]] - ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT4]] - ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[SELECT4]] + ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL2]](s32) ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[FCMP2]](s1), [[C3]], [[C4]] - ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT4]], [[SELECT5]] - ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB2]](s32), [[UV5]](s32) - ; GFX9-NEXT: [[FEXP2_2:%[0-9]+]]:_(s32) = G_FEXP2 [[INT5]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FEXP2_]](s32), [[FEXP2_1]](s32), [[FEXP2_2]](s32) + ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[INT3]], [[SELECT5]] + ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB1]](s32), [[UV4]](s32) + ; GFX9-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT4]](s32), [[C5]] + ; GFX9-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C6]], [[C4]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[SELECT6]] + ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX9-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[FCMP3]](s1), [[C7]], [[C2]] + ; 
GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INT5]], [[SELECT7]] + ; GFX9-NEXT: [[FCMP4:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV2]](s32), [[C]] + ; GFX9-NEXT: [[SELECT8:%[0-9]+]]:_(s32) = G_SELECT [[FCMP4]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[SELECT8]] + ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FMUL4]](s32) + ; GFX9-NEXT: [[SELECT9:%[0-9]+]]:_(s32) = G_SELECT [[FCMP4]](s1), [[C3]], [[C4]] + ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s32) = G_FSUB [[INT6]], [[SELECT9]] + ; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB2]](s32), [[UV5]](s32) + ; GFX9-NEXT: [[FCMP5:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT7]](s32), [[C5]] + ; GFX9-NEXT: [[SELECT10:%[0-9]+]]:_(s32) = G_SELECT [[FCMP5]](s1), [[C6]], [[C4]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[SELECT10]] + ; GFX9-NEXT: [[INT8:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD2]](s32) + ; GFX9-NEXT: [[SELECT11:%[0-9]+]]:_(s32) = G_SELECT [[FCMP5]](s1), [[C7]], [[C2]] + ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[INT8]], [[SELECT11]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL1]](s32), [[FMUL3]](s32), [[FMUL5]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 @@ -237,8 +315,16 @@ body: | ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan nsz G_FSUB [[INT]], [[SELECT1]] ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT1]] - ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = nnan nsz 
G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan nsz G_FADD [[INT1]], [[SELECT2]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan nsz G_FMUL [[INT2]], [[SELECT3]] + ; GFX6-NEXT: $vgpr0 = COPY [[FMUL1]](s32) ; GFX9-LABEL: name: test_fpow_s32_flags ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -256,8 +342,16 @@ body: | ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan nsz G_FSUB [[INT]], [[SELECT1]] ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT1]] - ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = nnan nsz G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan nsz G_FADD [[INT1]], [[SELECT2]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan nsz G_FMUL [[INT2]], [[SELECT3]] + ; GFX9-NEXT: $vgpr0 = 
COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = nnan nsz G_FPOW %0, %1 @@ -281,8 +375,18 @@ body: | ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), [[FPEXT1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT1]] + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fpow_s16 @@ -333,14 +437,29 @@ body: | ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), [[FPEXT1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = G_FEXP2 [[INT1]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) 
= G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT1]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[SELECT]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[SELECT1]] + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; GFX6-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT2]](s32) - ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT2]](s32), [[FPEXT3]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = G_FEXP2 [[INT3]] - ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_1]](s32) + ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT2]](s32) + ; GFX6-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT3]](s32), [[FPEXT3]](s32) + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[INT4]](s32), [[C1]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[SELECT2]] + ; GFX6-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: 
[[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT5]], [[SELECT3]] + ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) @@ -405,14 +524,29 @@ body: | ; GFX6-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), [[FPEXT1]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT1]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nnan nsz G_FCMP floatpred(olt), [[INT1]](s32), [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan nsz G_FADD [[INT1]], [[SELECT]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan nsz G_FMUL [[INT2]], [[SELECT1]] + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; GFX6-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) 
= nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT2]](s32) - ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT2]](s32), [[FPEXT3]](s32) - ; GFX6-NEXT: [[FEXP2_1:%[0-9]+]]:_(s32) = nnan nsz G_FEXP2 [[INT3]] - ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_1]](s32) + ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT2]](s32) + ; GFX6-NEXT: [[INT4:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT3]](s32), [[FPEXT3]](s32) + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = nnan nsz G_FCMP floatpred(olt), [[INT4]](s32), [[C1]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = nnan nsz G_FADD [[INT4]], [[SELECT2]] + ; GFX6-NEXT: [[INT5:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD1]](s32) + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = nnan nsz G_SELECT [[FCMP1]](s1), [[C4]], [[C5]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan nsz G_FMUL [[INT5]], [[SELECT3]] + ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir index 2bb9f17566978..3e7f4e8843f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir @@ -20,8 +20,18 @@ body: | ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY1]](s32) ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.log), [[FPEXT]](s32) ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[INT]](s32), [[SITOFP]](s32) - ; 
GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT1]] - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(olt), [[INT1]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[INT1]], [[SELECT]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C3]], [[C4]] + ; GFX6-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT2]], [[SELECT1]] + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fpowi_s16_s32_flags @@ -71,8 +81,16 @@ body: | ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan G_FSUB [[INT]], [[SELECT1]] ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[SITOFP]](s32) - ; GFX6-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT1]] - ; GFX6-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) 
= nnan G_FADD [[INT1]], [[SELECT2]] + ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX6-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT2]], [[SELECT3]] + ; GFX6-NEXT: $vgpr0 = COPY [[FMUL1]](s32) ; GFX9-LABEL: name: test_fpowi_s32_s32_flags ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -91,8 +109,16 @@ body: | ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP]](s1), [[C3]], [[C4]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nnan G_FSUB [[INT]], [[SELECT1]] ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FSUB]](s32), [[SITOFP]](s32) - ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT1]] - ; GFX9-NEXT: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.260000e+02 + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = nnan G_FCMP floatpred(olt), [[INT1]](s32), [[C5]] + ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 6.400000e+01 + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP1]](s1), [[C6]], [[C4]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[INT1]], [[SELECT2]] + ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), [[FADD]](s32) + ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3BF0000000000000 + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = nnan G_SELECT [[FCMP1]](s1), [[C7]], [[C2]] + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT2]], [[SELECT3]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMUL1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = nnan G_FPOWI %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll index 8c7d0b9b99ec3..b169063d67872 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -9,9 +9,17 @@ define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX7-NEXT: v_log_f32_e32 v0, v0 ; GFX7-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_exp_f32_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -64,7 +72,15 @@ define float @v_powi_f32(float %l, i32 %r) { ; GFX78-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_f32: @@ -81,7 +97,14 @@ define float @v_powi_f32(float %l, i32 %r) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 
0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 %r) ret float %res @@ -129,9 +152,17 @@ define float @v_powi_neg1_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_neg1_f32: @@ -147,7 +178,14 @@ define float @v_powi_neg1_f32(float %l) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -1) ret float %res @@ -165,9 +203,17 @@ define float @v_powi_2_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; 
GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_2_f32: @@ -183,7 +229,14 @@ define float @v_powi_2_f32(float %l) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 2) ret float %res @@ -201,9 +254,17 @@ define float @v_powi_neg2_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_neg2_f32: @@ -219,7 +280,14 @@ define float @v_powi_neg2_f32(float 
%l) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -2.0, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -2) ret float %res @@ -237,9 +305,17 @@ define float @v_powi_4_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_4_f32: @@ -255,7 +331,14 @@ define float @v_powi_4_f32(float %l) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 
+; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 4) ret float %res @@ -273,9 +356,17 @@ define float @v_powi_8_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_8_f32: @@ -291,7 +382,14 @@ define float @v_powi_8_f32(float %l) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41000000, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 8) ret float %res @@ -309,9 +407,17 @@ define float @v_powi_16_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0 
+; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_16_f32: @@ -327,7 +433,14 @@ define float @v_powi_16_f32(float %l) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41800000, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 16) ret float %res @@ -345,9 +458,17 @@ define float @v_powi_128_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_128_f32: @@ -363,7 +484,14 @@ define float @v_powi_128_f32(float %l) { ; GFX11-NEXT: 
v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x43000000, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 128) ret float %res @@ -381,9 +509,17 @@ define float @v_powi_neg128_f32(float %l) { ; GFX78-NEXT: v_log_f32_e32 v0, v0 ; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0xc3000000, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_neg128_f32: @@ -399,7 +535,14 @@ define float @v_powi_neg128_f32(float %l) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0xc3000000, v0 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX11-NEXT: 
v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -128) ret float %res diff --git a/llvm/test/CodeGen/AMDGPU/input-mods.ll b/llvm/test/CodeGen/AMDGPU/input-mods.ll index 720790df7e166..760507f7b9556 100644 --- a/llvm/test/CodeGen/AMDGPU/input-mods.ll +++ b/llvm/test/CodeGen/AMDGPU/input-mods.ll @@ -13,7 +13,7 @@ define amdgpu_ps void @test(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = call float @llvm.fabs.f32(float %r0) %r2 = fsub float -0.000000e+00, %r1 - %r3 = call float @llvm.exp2.f32(float %r2) + %r3 = call afn float @llvm.exp2.f32(float %r2) %vec = insertelement <4 x float> undef, float %r3, i32 0 call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 89dd726019c68..5478768befcda 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -10,71 +10,174 @@ ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { -; VI-LABEL: s_exp_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v0, s2, v0 -; VI-NEXT: v_exp_f32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX900-LABEL: s_exp_f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX900-NEXT: v_exp_f32_e32 v0, v0 -; 
GFX900-NEXT: global_store_dword v1, v0, s[2:3] -; GFX900-NEXT: s_endpgm -; -; SI-LABEL: s_exp_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, s2, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm +; VI-SDAG-LABEL: s_exp_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; VI-SDAG-NEXT: s_mov_b32 s3, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_exp_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 
v2, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX900-SDAG-LABEL: s_exp_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: s_endpgm +; +; GFX900-GISEL-LABEL: s_exp_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX900-GISEL-NEXT: s_endpgm +; +; SI-SDAG-LABEL: s_exp_f32: +; SI-SDAG: ; %bb.0: +; 
SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_exp_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp_f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: ; R600-NEXT: MUL_IEEE * T0.W, 
KC0[2].Z, literal.x, ; R600-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) -; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; R600-NEXT: EXP_IEEE * T1.X, PV.W, +; R600-NEXT: SETGT * T1.W, literal.x, PV.W, +; R600-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.W, T0.W, PV.W, +; R600-NEXT: CNDE * T1.W, T1.W, 1.0, literal.x, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: MUL_IEEE T0.X, PS, T1.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp_f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: ALU 15, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; CM-NEXT: MUL_IEEE * T0.W, KC0[2].Z, literal.y, -; CM-NEXT: 2(2.802597e-45), 1069066811(1.442695e+00) -; CM-NEXT: EXP_IEEE T1.X, T0.W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T0.W, KC0[2].Z, literal.x, +; CM-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) +; CM-NEXT: SETGT * T1.W, literal.x, PV.W, +; CM-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; CM-NEXT: CNDE * T2.W, PV.W, 0.0, literal.x, +; CM-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, T1.W, 1.0, literal.x, +; CM-NEXT: ADD * T0.W, T0.W, PV.W, BS:VEC_120/SCL_212 +; CM-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T0.X, PV.X, T0.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; 
CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call float @llvm.exp.f32(float %in) store float %result, ptr addrspace(1) %out ret void @@ -85,126 +188,233 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; VI-SDAG-NEXT: s_mov_b32 s0, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s3, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v1, v2, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] +; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0 +; VI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; VI-GISEL-NEXT: v_exp_f32_e32 v3, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v0, v2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, v3, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s3, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX900-SDAG-NEXT: 
v_cmp_gt_f32_e64 s[0:1], s0, v0 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v4, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x42800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-GISEL-NEXT: 
v_exp_f32_e32 v3, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, s0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, s3, v0 +; SI-SDAG-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; SI-GISEL-NEXT: 
v_mov_b32_e32 v1, s3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 +; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] +; SI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v3, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v3, v1 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp_v2f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 21, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: MUL_IEEE T0.W, KC0[3].X, literal.x, -; R600-NEXT: MUL_IEEE * T1.W, KC0[2].W, literal.x, +; R600-NEXT: MUL_IEEE * T0.W, KC0[3].X, literal.x, ; R600-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) -; R600-NEXT: EXP_IEEE * T0.Y, PV.W, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: EXP_IEEE * T0.X, T1.W, +; R600-NEXT: SETGT T1.W, literal.x, PV.W, +; R600-NEXT: MUL_IEEE * T2.W, KC0[2].W, literal.y, +; R600-NEXT: -1023672320(-1.260000e+02), 
1069066811(1.442695e+00) +; R600-NEXT: SETGT T3.W, literal.x, PS, +; R600-NEXT: CNDE * T4.W, PV.W, 0.0, literal.y, +; R600-NEXT: -1023672320(-1.260000e+02), 1115684864(6.400000e+01) +; R600-NEXT: ADD T0.W, T0.W, PS, +; R600-NEXT: CNDE * T4.W, PV.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.Z, T2.W, PS, +; R600-NEXT: CNDE T1.W, T1.W, 1.0, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W, +; R600-NEXT: CNDE T0.W, T3.W, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.Z, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.X, PS, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp_v2f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 26, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; CM-NEXT: MUL_IEEE * T0.Z, KC0[2].W, literal.y, -; CM-NEXT: 2(2.802597e-45), 1069066811(1.442695e+00) ; CM-NEXT: MUL_IEEE * T0.W, KC0[3].X, literal.x, ; CM-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T1.X (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T1.Y, T0.W, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T1.X, T0.Z, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T0.Z, +; CM-NEXT: MUL_IEEE T0.Z, KC0[2].W, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, PV.W, +; CM-NEXT: 1069066811(1.442695e+00), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T1.Z, PV.W, 0.0, literal.x, +; CM-NEXT: SETGT * T2.W, literal.y, PV.Z, +; CM-NEXT: 1115684864(6.400000e+01), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T0.Y, PV.W, 0.0, 
literal.x, +; CM-NEXT: CNDE T2.Z, T1.W, 1.0, literal.y, +; CM-NEXT: ADD * T0.W, T0.W, PV.Z, BS:VEC_120/SCL_212 +; CM-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T1.Y, PV.X, T2.Z, +; CM-NEXT: CNDE T1.Z, T2.W, 1.0, literal.x, +; CM-NEXT: ADD * T0.W, T0.Z, T0.Y, +; CM-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T1.X, PV.X, T1.Z, +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <2 x float> @llvm.exp.v2f32(<2 x float> %in) store <2 x float> %result, ptr addrspace(1) %out ret void @@ -214,36 +424,73 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp_v3f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s5, v0 +; VI-SDAG-NEXT: s_mov_b32 s7, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s7, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; VI-SDAG-NEXT: v_exp_f32_e32 v2, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v3 
+; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s7, v4 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s7, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] +; VI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v3f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s6 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; VI-GISEL-NEXT: v_exp_f32_e32 v2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; 
VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], v1, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_exp_f32_e32 v2, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v4, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, v5, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; @@ -252,14 +499,32 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s5, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, s5, v0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v3 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 
s[0:1], s0, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v5, v5 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v5, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -267,106 +532,203 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v5 +; 
GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], v1, v3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v2, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v5 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v5, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v3f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v3, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v2 -; SI-SDAG-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 
+; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] +; SI-SDAG-NEXT: s_mov_b32 s10, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v4, v0 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v3f32: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; 
SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 +; SI-GISEL-NEXT: v_mov_b32_e32 v5, s5 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v2, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], v1, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v5, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v4, s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp_v3f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: ALU 33, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: MUL_IEEE T0.W, KC0[3].Z, literal.x, -; R600-NEXT: MUL_IEEE * T1.W, KC0[3].Y, literal.x, +; R600-NEXT: MUL_IEEE * T0.W, KC0[3].Z, literal.x, ; R600-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) +; R600-NEXT: SETGT T1.W, literal.x, PV.W, +; R600-NEXT: MUL_IEEE * T2.W, KC0[3].Y, literal.y, +; R600-NEXT: -1023672320(-1.260000e+02), 1069066811(1.442695e+00) +; R600-NEXT: SETGT T3.W, literal.x, PS, +; R600-NEXT: CNDE * T4.W, PV.W, 0.0, literal.y, +; R600-NEXT: -1023672320(-1.260000e+02), 1115684864(6.400000e+01) +; R600-NEXT: ADD T0.Z, T0.W, PS, +; R600-NEXT: MUL_IEEE T0.W, KC0[3].W, 
literal.x, +; R600-NEXT: CNDE * T4.W, PV.W, 0.0, literal.y, +; R600-NEXT: 1069066811(1.442695e+00), 1115684864(6.400000e+01) +; R600-NEXT: ADD T0.Y, T2.W, PS, +; R600-NEXT: SETGT T1.Z, literal.x, PV.W, +; R600-NEXT: CNDE T1.W, T1.W, 1.0, literal.y, BS:VEC_120/SCL_212 +; R600-NEXT: EXP_IEEE * T0.X, PV.Z, +; R600-NEXT: -1023672320(-1.260000e+02), 528482304(5.421011e-20) +; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W, +; R600-NEXT: CNDE T0.Z, PV.Z, 0.0, literal.x, +; R600-NEXT: CNDE T1.W, T3.W, 1.0, literal.y, +; R600-NEXT: EXP_IEEE * T0.X, PV.Y, +; R600-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; R600-NEXT: MUL_IEEE T1.X, PS, PV.W, +; R600-NEXT: ADD T0.W, T0.W, PV.Z, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: CNDE T1.W, T1.Z, 1.0, literal.x, ; R600-NEXT: EXP_IEEE * T0.Y, PV.W, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: MUL_IEEE T0.Z, KC0[3].W, literal.y, -; R600-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z, -; R600-NEXT: EXP_IEEE * T0.X, T1.W, -; R600-NEXT: 2(2.802597e-45), 1069066811(1.442695e+00) +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.X, PS, PV.W, +; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; R600-NEXT: LSHR T2.X, PV.W, literal.x, -; R600-NEXT: EXP_IEEE * T3.X, PV.Z, +; R600-NEXT: LSHR * T3.X, PV.W, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp_v3f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 19, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 39, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3, T0.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; CM-NEXT: LSHR T1.X, PV.W, literal.x, -; CM-NEXT: MUL_IEEE 
T0.Y, KC0[3].Y, literal.y, -; CM-NEXT: MUL_IEEE T0.Z, KC0[3].Z, literal.y, -; CM-NEXT: MUL_IEEE * T0.W, KC0[3].W, literal.y, -; CM-NEXT: 2(2.802597e-45), 1069066811(1.442695e+00) -; CM-NEXT: EXP_IEEE T2.X, T0.W, -; CM-NEXT: EXP_IEEE T2.Y (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T2.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T2.W (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T3.X (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE T3.Y, T0.Z, -; CM-NEXT: EXP_IEEE T3.Z (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE * T3.W (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE T3.X, T0.Y, -; CM-NEXT: EXP_IEEE T3.Y (MASKED), T0.Y, -; CM-NEXT: EXP_IEEE T3.Z (MASKED), T0.Y, -; CM-NEXT: EXP_IEEE * T3.W (MASKED), T0.Y, +; CM-NEXT: MUL_IEEE * T0.W, KC0[3].W, literal.x, +; CM-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T0.Y, KC0[3].Y, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, KC0[3].Z, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, PV.W, +; CM-NEXT: 1069066811(1.442695e+00), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T1.Y, PV.W, 0.0, literal.x, +; CM-NEXT: SETGT T1.Z, literal.y, PV.Z, +; CM-NEXT: SETGT * T2.W, literal.y, PV.Y, +; CM-NEXT: 1115684864(6.400000e+01), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T0.X, PV.W, 0.0, literal.x, +; CM-NEXT: CNDE T2.Y, PV.Z, 0.0, literal.x, +; CM-NEXT: CNDE T2.Z, T1.W, 1.0, literal.y, +; CM-NEXT: ADD * T0.W, T0.W, PV.Y, BS:VEC_120/SCL_212 +; CM-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W, T0.W, +; CM-NEXT: MUL_IEEE T1.X, PV.W, T2.Z, +; CM-NEXT: CNDE T1.Y, T1.Z, 1.0, literal.x, +; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y, +; CM-NEXT: ADD * T0.W, T0.Z, T2.Y, BS:VEC_201 +; CM-NEXT: 528482304(5.421011e-20), 8(1.121039e-44) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z, T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: LSHR T2.X, T1.Z, 
literal.x, +; CM-NEXT: MUL_IEEE T3.Y, PV.Z, T1.Y, +; CM-NEXT: CNDE T0.Z, T2.W, 1.0, literal.y, +; CM-NEXT: ADD * T0.W, T0.Y, T0.X, +; CM-NEXT: 2(2.802597e-45), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T3.X, PV.X, T0.Z, +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <3 x float> @llvm.exp.v3f32(<3 x float> %in) store <3 x float> %result, ptr addrspace(1) %out ret void @@ -378,41 +740,87 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp_v4f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b +; VI-SDAG-NEXT: s_mov_b32 s8, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 -; VI-SDAG-NEXT: v_exp_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s8, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s8, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v1, s[0:1] ; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 +; VI-SDAG-NEXT: v_exp_f32_e32 v5, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v4, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[0:1] +; VI-SDAG-NEXT: v_mul_f32_e32 v2, v5, v2 
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, s5, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s8, v5 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s8, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v5, v5 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v4, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] +; VI-SDAG-NEXT: v_mul_f32_e32 v1, v5, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v4f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x1f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s6 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s7 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 ; VI-GISEL-NEXT: 
v_exp_f32_e32 v1, v1 -; VI-GISEL-NEXT: v_exp_f32_e32 v2, v2 -; VI-GISEL-NEXT: v_exp_f32_e32 v3, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b +; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v5, s[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], v2, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-GISEL-NEXT: v_exp_f32_e32 v3, v2 +; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, v6, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm ; @@ -421,16 +829,39 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: s_mov_b32 s8, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s8, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v1 +; 
GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s8, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x1f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v6, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, v6, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v2, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s5, v0 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s8, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v5, vcc +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s8, v0 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v6, vcc +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, v6, s[0:1] +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -438,134 +869,304 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 
s7 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v4, s[0:1] ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v5, s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v6 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], v2, v3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v6, v6 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v4f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 
-; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s2, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v3, v1 +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] +; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v5, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v3, v1, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, v5, s[0:1] +; SI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s2, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v5, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v4 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[0:1] +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v4f32: ; SI-GISEL: ; 
%bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, s6 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, s7 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 +; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v4, s[0:1] ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v3, v3 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v6 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v5, s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v6 +; SI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], v2, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] +; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 +; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: 
v_exp_f32_e32 v6, v6 +; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v6, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp_v4f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: ALU 40, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: MUL_IEEE T0.W, KC0[4].X, literal.x, -; R600-NEXT: MUL_IEEE * T1.W, KC0[3].W, literal.x, -; R600-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) -; R600-NEXT: EXP_IEEE * T0.W, PV.W, -; R600-NEXT: MUL_IEEE T2.W, KC0[3].Z, literal.x, -; R600-NEXT: EXP_IEEE * T0.Z, T1.W, -; R600-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T1.W, KC0[3].Y, literal.x, -; R600-NEXT: EXP_IEEE * T0.Y, PV.W, +; R600-NEXT: MUL_IEEE * T0.W, KC0[4].X, literal.x, ; R600-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: SETGT T1.W, literal.x, PV.W, +; R600-NEXT: MUL_IEEE * T2.W, KC0[3].W, literal.y, +; R600-NEXT: -1023672320(-1.260000e+02), 1069066811(1.442695e+00) +; R600-NEXT: SETGT T0.Z, literal.x, PS, +; R600-NEXT: MUL_IEEE T3.W, KC0[3].Z, literal.y, +; R600-NEXT: CNDE * T4.W, PV.W, 0.0, literal.z, +; R600-NEXT: -1023672320(-1.260000e+02), 1069066811(1.442695e+00) +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.Y, T0.W, PS, +; R600-NEXT: SETGT T1.Z, literal.x, PV.W, +; R600-NEXT: MUL_IEEE T0.W, KC0[3].Y, literal.y, +; R600-NEXT: CNDE * T4.W, PV.Z, 0.0, literal.z, +; R600-NEXT: 
-1023672320(-1.260000e+02), 1069066811(1.442695e+00) +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.X, T2.W, PS, +; R600-NEXT: CNDE T1.Y, T1.W, 1.0, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: SETGT T2.Z, literal.y, PV.W, +; R600-NEXT: CNDE T1.W, PV.Z, 0.0, literal.z, +; R600-NEXT: EXP_IEEE * T0.Y, PV.Y, +; R600-NEXT: 528482304(5.421011e-20), -1023672320(-1.260000e+02) +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T1.X, T3.W, PV.W, +; R600-NEXT: CNDE T2.Y, T0.Z, 1.0, literal.x, +; R600-NEXT: CNDE T0.Z, PV.Z, 0.0, literal.y, +; R600-NEXT: MUL_IEEE T3.W, PS, PV.Y, +; R600-NEXT: EXP_IEEE * T0.X, PV.X, +; R600-NEXT: 528482304(5.421011e-20), 1115684864(6.400000e+01) +; R600-NEXT: ADD T0.Y, T0.W, PV.Z, +; R600-NEXT: MUL_IEEE T3.Z, PS, PV.Y, +; R600-NEXT: CNDE T0.W, T1.Z, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.X, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T3.Y, PS, PV.W, +; R600-NEXT: CNDE T0.W, T2.Z, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.Y, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T3.X, PS, PV.W, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp_v4f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 22, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X +; CM-NEXT: ALU 49, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; CM-NEXT: MUL_IEEE * T0.W, KC0[3].Y, literal.y, -; CM-NEXT: 2(2.802597e-45), 1069066811(1.442695e+00) -; CM-NEXT: MUL_IEEE T0.Y, KC0[3].Z, literal.x, -; CM-NEXT: MUL_IEEE T0.Z, KC0[3].W, literal.x, -; CM-NEXT: MUL_IEEE * T1.W, KC0[4].X, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, KC0[3].Z, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, KC0[4].X, literal.x, ; CM-NEXT: 1069066811(1.442695e+00), 
0(0.000000e+00) -; CM-NEXT: EXP_IEEE T1.X (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), T1.W, -; CM-NEXT: EXP_IEEE * T1.W, T1.W, -; CM-NEXT: EXP_IEEE T1.X (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE T1.Z, T0.Z, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T0.Z, -; CM-NEXT: EXP_IEEE T1.X (MASKED), T0.Y, -; CM-NEXT: EXP_IEEE T1.Y, T0.Y, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), T0.Y, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T0.Y, -; CM-NEXT: EXP_IEEE T1.X, T0.W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T0.Y, KC0[3].W, literal.x, +; CM-NEXT: SETGT T1.Z, literal.y, PV.W, +; CM-NEXT: SETGT * T1.W, literal.y, PV.Z, +; CM-NEXT: 1069066811(1.442695e+00), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T1.Y, PV.W, 0.0, literal.x, +; CM-NEXT: CNDE T2.Z, PV.Z, 0.0, literal.x, +; CM-NEXT: SETGT * T2.W, literal.y, PV.Y, +; CM-NEXT: 1115684864(6.400000e+01), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T0.X, T1.Z, 1.0, literal.x, +; CM-NEXT: CNDE T2.Y, PV.W, 0.0, literal.y, +; CM-NEXT: MUL_IEEE T1.Z, KC0[3].Y, literal.z, +; CM-NEXT: ADD * T0.W, T0.W, PV.Z, +; CM-NEXT: 528482304(5.421011e-20), 1115684864(6.400000e+01) +; CM-NEXT: 1069066811(1.442695e+00), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W, T0.W, +; CM-NEXT: CNDE T1.X, T2.W, 1.0, literal.x, +; CM-NEXT: SETGT T3.Y, literal.y, T1.Z, +; CM-NEXT: ADD T2.Z, T0.Y, T2.Y, +; CM-NEXT: MUL_IEEE * T2.W, PV.W, T0.X, +; CM-NEXT: 528482304(5.421011e-20), -1023672320(-1.260000e+02) +; CM-NEXT: EXP_IEEE T0.X, T2.Z, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T2.Z, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T2.Z, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T2.Z, +; CM-NEXT: CNDE T2.X, T3.Y, 0.0, literal.x, +; CM-NEXT: CNDE T0.Y, T1.W, 1.0, literal.y, +; CM-NEXT: 
MUL_IEEE T2.Z, PV.X, T1.X, +; CM-NEXT: ADD * T0.W, T0.Z, T1.Y, +; CM-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T2.Y, PV.X, T0.Y, +; CM-NEXT: CNDE T0.Z, T3.Y, 1.0, literal.x, +; CM-NEXT: ADD * T0.W, T1.Z, T2.X, +; CM-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T2.X, PV.X, T0.Z, +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <4 x float> @llvm.exp.v4f32(<4 x float> %in) store <4 x float> %result, ptr addrspace(1) %out ret void } define float @v_exp_f32(float %in) { -; GCN-LABEL: v_exp_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32: +; GCN-GISEL: ; %bb.0: +; 
GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SI-SDAG-LABEL: v_exp_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32: ; R600: ; %bb.0: @@ -586,7 +1187,15 @@ define float @v_exp_fabs_f32(float %in) { ; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SDAG-NEXT: s_mov_b32 s4, 
0x3fb8aa3b ; GCN-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, s4 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp_fabs_f32: @@ -594,7 +1203,15 @@ define float @v_exp_fabs_f32(float %in) { ; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GCN-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; SI-SDAG-LABEL: v_exp_fabs_f32: @@ -602,7 +1219,15 @@ define float @v_exp_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, s4 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fabs_f32: @@ -610,7 +1235,15 @@ define float @v_exp_fabs_f32(float %in) { 
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fabs_f32: @@ -633,7 +1266,15 @@ define float @v_exp_fneg_fabs_f32(float %in) { ; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SDAG-NEXT: s_mov_b32 s4, 0xbfb8aa3b ; GCN-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, s4 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp_fneg_fabs_f32: @@ -641,7 +1282,15 @@ define float @v_exp_fneg_fabs_f32(float %in) { ; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GCN-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: 
s_setpc_b64 s[30:31] ; ; SI-SDAG-LABEL: v_exp_fneg_fabs_f32: @@ -649,7 +1298,15 @@ define float @v_exp_fneg_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0xbfb8aa3b ; SI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, s4 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_fabs_f32: @@ -657,7 +1314,15 @@ define float @v_exp_fneg_fabs_f32(float %in) { ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_fabs_f32: @@ -680,7 +1345,15 @@ define float @v_exp_fneg_f32(float %in) { ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: 
v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp_fneg_f32: @@ -688,14 +1361,30 @@ define float @v_exp_fneg_f32(float %in) { ; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GCN-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; SI-SDAG-LABEL: v_exp_fneg_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_f32: @@ -703,7 +1392,15 @@ define float @v_exp_fneg_f32(float %in) { ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; 
SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_f32: @@ -805,19 +1502,65 @@ define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { } define float @v_exp_f32_ninf(float %in) { -; GCN-LABEL: v_exp_f32_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_ninf: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_ninf: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32_ninf: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; 
SI-SDAG-LABEL: v_exp_f32_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_ninf: ; R600: ; %bb.0: @@ -992,19 +1735,65 @@ define float @v_exp_f32_daz(float %in) #0 { } define float @v_exp_f32_nnan(float %in) { -; GCN-LABEL: v_exp_f32_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_nnan: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; 
GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp_f32_nnan: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_nnan: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp_f32_nnan: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_nnan: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_nnan: ; R600: ; %bb.0: @@ -1048,19 +1837,65 @@ define float @v_exp_f32_nnan_daz(float %in) #0 { } define float @v_exp_f32_nnan_dynamic(float %in) #1 { -; GCN-LABEL: v_exp_f32_nnan_dynamic: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_nnan_dynamic: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_nnan_dynamic: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32_nnan_dynamic: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: 
v_exp_f32_e32 v0, v1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SI-SDAG-LABEL: v_exp_f32_nnan_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_nnan_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_nnan_dynamic: ; R600: ; %bb.0: @@ -1104,19 +1939,65 @@ define float @v_exp_f32_ninf_daz(float %in) #0 { } define float @v_exp_f32_ninf_dynamic(float %in) #1 { -; GCN-LABEL: v_exp_f32_ninf_dynamic: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_ninf_dynamic: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_ninf_dynamic: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32_ninf_dynamic: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SI-SDAG-LABEL: v_exp_f32_ninf_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_ninf_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_ninf_dynamic: ; R600: ; %bb.0: @@ -1132,19 +2013,65 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 { } define float @v_exp_f32_nnan_ninf(float %in) { -; GCN-LABEL: v_exp_f32_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_nnan_ninf: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_nnan_ninf: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32_nnan_ninf: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SI-SDAG-LABEL: v_exp_f32_nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_nnan_ninf: ; R600: ; %bb.0: @@ -1188,19 +2115,65 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 { } define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 { -; GCN-LABEL: 
v_exp_f32_nnan_ninf_dynamic: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_nnan_ninf_dynamic: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_nnan_ninf_dynamic: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32_nnan_ninf_dynamic: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SI-SDAG-LABEL: v_exp_f32_nnan_ninf_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 
+; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_nnan_ninf_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_nnan_ninf_dynamic: ; R600: ; %bb.0: @@ -1244,19 +2217,65 @@ define float @v_exp_f32_fast_daz(float %in) #0 { } define float @v_exp_f32_dynamic_mode(float %in) #1 { -; GCN-LABEL: v_exp_f32_dynamic_mode: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_dynamic_mode: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; GCN-SDAG-NEXT: 
v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_dynamic_mode: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32_dynamic_mode: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SI-SDAG-LABEL: v_exp_f32_dynamic_mode: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_mac_f32_e32 v2, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_dynamic_mode: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-GISEL-NEXT: v_mac_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: 
v_exp_f32_e32 v0, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_dynamic_mode: ; R600: ; %bb.0: @@ -1283,7 +2302,15 @@ define float @v_exp_f32_undef() { ; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; SI-SDAG-LABEL: v_exp_f32_undef: @@ -1297,7 +2324,15 @@ define float @v_exp_f32_undef() { ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_undef: @@ -1340,21 +2375,69 @@ define float @v_exp_f32_0() { } define float @v_exp_f32_from_fpext_f16(i16 %src.i) { -; GCN-LABEL: v_exp_f32_from_fpext_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_from_fpext_f16: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; SI-LABEL: v_exp_f32_from_fpext_f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; GCN-GISEL-LABEL: v_exp_f32_from_fpext_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SI-SDAG-LABEL: v_exp_f32_from_fpext_f16: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_exp_f32_from_fpext_f16: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_from_fpext_f16: ; R600: ; %bb.0: @@ -1372,23 +2455,56 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) { } define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { -; GCN-LABEL: v_exp_f32_from_fpext_math_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_f16_e32 v0, v0, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp_f32_from_fpext_math_f16: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_add_f16_e32 v0, v0, v1 +; GCN-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 
+; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp_f32_from_fpext_math_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_add_f16_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16: @@ -1396,11 +2512,19 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_from_fpext_math_f16: @@ -1425,30 +2549,62 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) { ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp_f32_from_fpext_bf16: ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: 
s_setpc_b64 s[30:31] ; ; SI-SDAG-LABEL: v_exp_f32_from_fpext_bf16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_f32_from_fpext_bf16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_from_fpext_bf16: @@ -1576,17 +2732,33 @@ define half @v_exp_f16(half %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: 
v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1624,17 +2796,33 @@ define half @v_exp_fabs_f16(half %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fabs_f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 
0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1673,17 +2861,33 @@ define half @v_exp_fneg_fabs_f16(half %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_fabs_f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1722,17 +2926,33 @@ define half @v_exp_fneg_f16(half %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: 
v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1762,9 +2982,17 @@ define half @v_exp_f16_fast(half %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_f16_fast: @@ -1838,12 +3066,25 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; 
SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[4:5] +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v2f16: @@ -1851,10 +3092,23 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1925,12 +3179,25 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[4:5] +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fabs_v2f16: @@ -1943,12 +3210,25 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: s_mov_b32 s4, 0xc2fc0000 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v3 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; 
SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v1, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v2, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fabs_v2f16: @@ -2018,16 +3298,29 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[4:5] +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v2, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; 
SI-GISEL-LABEL: v_exp_fneg_fabs_v2f16: @@ -2040,12 +3333,25 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: s_mov_b32 s4, 0xc2fc0000 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v3 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v1, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v2, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_fabs_v2f16: @@ -2115,16 +3421,29 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: 
v_mul_f32_e32 v1, 0x3fb8aa3b, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[4:5] +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v2, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_v2f16: @@ -2137,12 +3456,25 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: s_mov_b32 s4, 0xc2fc0000 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v3 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v1, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v2, v1 +; SI-GISEL-NEXT: 
v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_v2f16: @@ -2208,12 +3540,25 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[4:5] +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v2f16_fast: @@ -2288,30 +3633,67 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-SDAG-NEXT: v_mul_f32_e32 
v1, 0x3fb8aa3b, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v4, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v4, s[4:5] +; SI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v3f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42800000 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v5 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x1f800000 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, 
v4, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v5, vcc ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v6 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2375,15 +3757,33 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v4, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v4, s[4:5] +; SI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v3f16_afn: @@ -2430,3 +3830,5 @@ declare <2 x 
half> @llvm.fabs.v2f16(<2 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX900: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index abb3cd134630d..0e43cb7c1aef2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -10,62 +10,158 @@ ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { -; SI-LABEL: s_exp2_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_exp_f32_e32 v0, s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: s_exp2_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_exp_f32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX900-LABEL: s_exp2_f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_exp_f32_e32 v0, s4 -; GFX900-NEXT: global_store_dword v1, v0, s[2:3] -; GFX900-NEXT: s_endpgm +; SI-SDAG-LABEL: s_exp2_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; 
SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_exp2_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_exp2_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; 
VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_exp2_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX900-SDAG-LABEL: s_exp2_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] +; GFX900-SDAG-NEXT: s_endpgm +; +; GFX900-GISEL-LABEL: s_exp2_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; 
GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; R600-NEXT: EXP_IEEE * T1.X, KC0[2].Z, +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; R600-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T1.W, KC0[2].Z, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 1.0, literal.x, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: MUL_IEEE T0.X, PS, T0.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp2_f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 0.0, literal.x, +; CM-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, T0.W, 1.0, literal.x, +; CM-NEXT: ADD * T0.W, KC0[2].Z, PV.W, +; CM-NEXT: 528482304(5.421011e-20), 
0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T0.X, PV.X, T0.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T1.X, KC0[2].Z, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), KC0[2].Z, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), KC0[2].Z, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), KC0[2].Z, %result = call float @llvm.exp2.f32(float %in) store float %result, ptr addrspace(1) %out ret void @@ -77,33 +173,74 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-LABEL: s_exp2_v2f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_exp_f32_e32 v1, s3 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, s2 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_exp_f32_e32 v3, v1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s4, s0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; 
SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_exp_f32_e32 v0, s2 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, s3 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] +; SI-GISEL-NEXT: v_add_f32_e32 v2, s6, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s7, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v3, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v3, v1 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_exp_f32_e32 v1, s3 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, s2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v1 
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -111,64 +248,136 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_exp_f32_e32 v0, s2 -; VI-GISEL-NEXT: v_exp_f32_e32 v1, s3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v2, s6, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s7, v0 +; VI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; VI-GISEL-NEXT: v_exp_f32_e32 v3, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v0, v2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, v3, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX900-SDAG-NEXT: v_exp_f32_e32 v1, s3 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, s2 -; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x42800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, s2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, s3 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s7, v0 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v1, s[0:1] +; 
GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v2f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: EXP_IEEE * T0.Y, KC0[3].X, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: EXP_IEEE * T0.X, KC0[2].W, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].W, +; R600-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T2.W, KC0[3].X, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.Z, KC0[2].W, PS, +; R600-NEXT: CNDE T0.W, T0.W, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W, +; R600-NEXT: CNDE T0.W, T1.W, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.Z, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.X, PS, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp2_v2f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].X, +; CM-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Z, PV.W, 0.0, literal.x, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[2].W, +; CM-NEXT: 
1115684864(6.400000e+01), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T0.Y, PV.W, 0.0, literal.x, +; CM-NEXT: CNDE T1.Z, T0.W, 1.0, literal.y, +; CM-NEXT: ADD * T0.W, KC0[3].X, PV.Z, +; CM-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T1.Y, PV.X, T1.Z, +; CM-NEXT: CNDE T0.Z, T1.W, 1.0, literal.x, +; CM-NEXT: ADD * T0.W, KC0[2].W, T0.Y, +; CM-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T1.X, PV.X, T0.Z, ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T1.X (MASKED), KC0[3].X, -; CM-NEXT: EXP_IEEE T1.Y, KC0[3].X, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), KC0[3].X, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), KC0[3].X, -; CM-NEXT: EXP_IEEE T1.X, KC0[2].W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), KC0[2].W, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), KC0[2].W, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), KC0[2].W, %result = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) store <2 x float> %result, ptr addrspace(1) %out ret void @@ -179,12 +388,30 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc +; 
SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v4, s5, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s4, v6 +; SI-SDAG-NEXT: v_exp_f32_e32 v3, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_exp_f32_e32 v2, s6 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, s5 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, s4 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, v3, v7 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v6, v5 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm @@ -192,26 +419,65 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-GISEL-LABEL: s_exp2_v3f32: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_exp_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, s6 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 
1.0, v3, vcc +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v4, s5, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; SI-GISEL-NEXT: v_exp_f32_e32 v4, v4 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_exp_f32_e32 v2, s6 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, s5 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, s4 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v4, s6, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v6, s5, v6 +; 
VI-SDAG-NEXT: v_exp_f32_e32 v3, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -219,13 +485,34 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-LABEL: s_exp2_v3f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_exp_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_exp_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_exp_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v4, s5, v4 +; VI-GISEL-NEXT: v_add_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v4, v4 +; VI-GISEL-NEXT: v_exp_f32_e32 v2, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; 
VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; @@ -233,67 +520,146 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, s6 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, s5 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s6, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v6, s5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, s6 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX900-GISEL-NEXT: v_add_f32_e32 v4, s5, v4 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v4, v4 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v3f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: ALU 29, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: EXP_IEEE * 
T0.Y, KC0[3].Z, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y, -; R600-NEXT: EXP_IEEE * T0.X, KC0[3].Y, -; R600-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; R600-NEXT: LSHR T2.X, PV.W, literal.x, -; R600-NEXT: EXP_IEEE * T3.X, KC0[3].W, +; R600-NEXT: SETGT T0.W, literal.x, KC0[3].Z, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].Y, +; R600-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T2.W, KC0[3].Z, PV.W, +; R600-NEXT: CNDE * T3.W, T1.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.Y, KC0[3].Y, PS, +; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].W, +; R600-NEXT: CNDE T0.W, T0.W, 1.0, literal.y, +; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: -1023672320(-1.260000e+02), 528482304(5.421011e-20) +; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W, +; R600-NEXT: CNDE T1.Z, PV.Z, 0.0, literal.x, +; R600-NEXT: CNDE T0.W, T1.W, 1.0, literal.y, +; R600-NEXT: EXP_IEEE * T0.X, PV.Y, +; R600-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; R600-NEXT: MUL_IEEE T1.X, PS, PV.W, +; R600-NEXT: ADD T0.W, KC0[3].W, PV.Z, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: CNDE T1.W, T0.Z, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.Y, PV.W, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.X, PS, PV.W, +; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; R600-NEXT: LSHR * T3.X, PV.W, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp2_v3f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3, T1.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X +; CM-NEXT: ALU 35, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X +; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR * T0.X, PV.W, literal.x, -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[3].W, +; CM-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 0.0, literal.x, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].Y, +; CM-NEXT: 1115684864(6.400000e+01), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T0.X, PV.W, 0.0, literal.x, +; CM-NEXT: CNDE T1.Y, PV.Z, 0.0, literal.x, +; CM-NEXT: CNDE T1.Z, T0.W, 1.0, literal.y, +; CM-NEXT: ADD * T0.W, KC0[3].W, PV.Y, +; CM-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Y, T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE T1.X, PV.Y, T1.Z, +; CM-NEXT: CNDE T0.Y, T0.Z, 1.0, literal.x, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y, +; CM-NEXT: ADD * T0.W, KC0[3].Z, T1.Y, +; CM-NEXT: 528482304(5.421011e-20), 8(1.121039e-44) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W, T0.W, +; CM-NEXT: LSHR T2.X, T0.Z, literal.x, +; CM-NEXT: MUL_IEEE T0.Y, PV.W, T0.Y, +; CM-NEXT: CNDE T0.Z, T1.W, 1.0, literal.y, +; CM-NEXT: ADD * T0.W, KC0[3].Y, T0.X, +; CM-NEXT: 2(2.802597e-45), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: MUL_IEEE * T0.X, PV.X, T0.Z, +; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T2.X, KC0[3].W, -; CM-NEXT: EXP_IEEE T2.Y (MASKED), 
KC0[3].W, -; CM-NEXT: EXP_IEEE T2.Z (MASKED), KC0[3].W, -; CM-NEXT: EXP_IEEE * T2.W (MASKED), KC0[3].W, -; CM-NEXT: EXP_IEEE T3.X (MASKED), KC0[3].Z, -; CM-NEXT: EXP_IEEE T3.Y, KC0[3].Z, -; CM-NEXT: EXP_IEEE T3.Z (MASKED), KC0[3].Z, -; CM-NEXT: EXP_IEEE * T3.W (MASKED), KC0[3].Z, -; CM-NEXT: EXP_IEEE T3.X, KC0[3].Y, -; CM-NEXT: EXP_IEEE T3.Y (MASKED), KC0[3].Y, -; CM-NEXT: EXP_IEEE T3.Z (MASKED), KC0[3].Y, -; CM-NEXT: EXP_IEEE * T3.W (MASKED), KC0[3].Y, %result = call <3 x float> @llvm.exp2.v3f32(<3 x float> %in) store <3 x float> %result, ptr addrspace(1) %out ret void @@ -304,42 +670,114 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_exp_f32_e32 v3, s7 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, s6 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, s5 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, s4 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 
vcc, s0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8 +; SI-SDAG-NEXT: v_exp_f32_e32 v9, v1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_exp_f32_e32 v0, s4 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, s5 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, s6 -; SI-GISEL-NEXT: v_exp_f32_e32 v3, s7 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 +; 
SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] +; SI-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 +; SI-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-GISEL-NEXT: s_mov_b32 s10, -1 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v4f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_exp_f32_e32 v3, s7 -; VI-SDAG-NEXT: v_exp_f32_e32 v2, s6 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, s5 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, s4 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; VI-SDAG-NEXT: 
v_exp_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v8, v8 +; VI-SDAG-NEXT: v_exp_f32_e32 v9, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -347,14 +785,40 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-LABEL: s_exp2_v4f32: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_exp_f32_e32 v0, s4 -; VI-GISEL-NEXT: v_exp_f32_e32 v1, s5 -; VI-GISEL-NEXT: v_exp_f32_e32 v2, s6 -; VI-GISEL-NEXT: v_exp_f32_e32 v3, s7 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, 
v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5 +; VI-GISEL-NEXT: v_exp_f32_e32 v3, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm ; @@ -362,12 +826,35 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, s7 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, s6 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, s5 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, s4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, s7, v5 +; 
GFX900-SDAG-NEXT: v_add_f32_e32 v7, s6, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v9, s5, v9 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v5, v5 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v7, v7 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v9, v9 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v10, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, v5, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -375,65 +862,167 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, s4 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, s5 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, s6 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, s7 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] +; GFX900-GISEL-NEXT: 
v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v5 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v4f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 33, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: EXP_IEEE * T0.W, KC0[4].X, -; R600-NEXT: EXP_IEEE * T0.Z, KC0[3].W, -; R600-NEXT: EXP_IEEE * T0.Y, KC0[3].Z, -; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: EXP_IEEE * T0.X, KC0[3].Y, +; R600-NEXT: SETGT T0.W, literal.x, KC0[4].X, +; R600-NEXT: SETGT * T1.W, literal.x, KC0[3].W, +; R600-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; R600-NEXT: CNDE * T2.W, PV.W, 0.0, literal.x, +; R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.Z, KC0[4].X, PV.W, +; R600-NEXT: SETGT T2.W, literal.x, KC0[3].Z, +; R600-NEXT: CNDE * T3.W, T1.W, 0.0, literal.y, +; R600-NEXT: -1023672320(-1.260000e+02), 1115684864(6.400000e+01) +; R600-NEXT: ADD T0.X, KC0[3].W, PS, +; R600-NEXT: CNDE T0.Y, T0.W, 1.0, literal.x, +; R600-NEXT: SETGT T1.Z, literal.y, KC0[3].Y, +; R600-NEXT: CNDE T0.W, PV.W, 0.0, literal.z, +; R600-NEXT: EXP_IEEE * T0.Z, PV.Z, +; R600-NEXT: 528482304(5.421011e-20), -1023672320(-1.260000e+02) +; 
R600-NEXT: 1115684864(6.400000e+01), 0(0.000000e+00) +; R600-NEXT: ADD T1.X, KC0[3].Z, PV.W, +; R600-NEXT: CNDE T1.Y, T1.W, 1.0, literal.x, +; R600-NEXT: CNDE T2.Z, PV.Z, 0.0, literal.y, +; R600-NEXT: MUL_IEEE T0.W, PS, PV.Y, +; R600-NEXT: EXP_IEEE * T0.X, PV.X, +; R600-NEXT: 528482304(5.421011e-20), 1115684864(6.400000e+01) +; R600-NEXT: ADD T2.Y, KC0[3].Y, PV.Z, +; R600-NEXT: MUL_IEEE T0.Z, PS, PV.Y, +; R600-NEXT: CNDE T1.W, T2.W, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.X, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W, +; R600-NEXT: CNDE T1.W, T1.Z, 1.0, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.Y, +; R600-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T0.X, PS, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_exp2_v4f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X +; CM-NEXT: ALU 43, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: SETGT * T0.W, literal.x, KC0[4].X, +; CM-NEXT: -1023672320(-1.260000e+02), 0(0.000000e+00) +; CM-NEXT: CNDE T0.Y, PV.W, 0.0, literal.x, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W, +; CM-NEXT: SETGT * T1.W, literal.y, KC0[3].Z, +; CM-NEXT: 1115684864(6.400000e+01), -1023672320(-1.260000e+02) +; CM-NEXT: CNDE T0.X, PV.W, 0.0, literal.x, +; CM-NEXT: CNDE T1.Y, T0.W, 1.0, literal.y, +; CM-NEXT: CNDE T1.Z, PV.Z, 0.0, literal.x, +; CM-NEXT: ADD * T0.W, KC0[4].X, PV.Y, +; CM-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Y, T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: CNDE T1.X, T0.Z, 1.0, literal.x, +; CM-NEXT: SETGT T2.Y, literal.y, KC0[3].Y, 
+; CM-NEXT: ADD T0.Z, KC0[3].W, T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, PV.Y, T1.Y, +; CM-NEXT: 528482304(5.421011e-20), -1023672320(-1.260000e+02) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.Z, +; CM-NEXT: EXP_IEEE T0.Y, T0.Z, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.Z, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.Z, +; CM-NEXT: CNDE T2.X, T2.Y, 0.0, literal.x, +; CM-NEXT: CNDE T1.Y, T1.W, 1.0, literal.y, +; CM-NEXT: MUL_IEEE T0.Z, PV.Y, T1.X, +; CM-NEXT: ADD * T1.W, KC0[3].Z, T0.X, BS:VEC_021/SCL_122 +; CM-NEXT: 1115684864(6.400000e+01), 528482304(5.421011e-20) +; CM-NEXT: EXP_IEEE T0.X, T1.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: MUL_IEEE T0.Y, PV.X, T1.Y, +; CM-NEXT: CNDE T1.Z, T2.Y, 1.0, literal.x, +; CM-NEXT: ADD * T1.W, KC0[3].Y, T2.X, +; CM-NEXT: 528482304(5.421011e-20), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T1.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: MUL_IEEE * T0.X, PV.X, T1.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T1.X (MASKED), KC0[4].X, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), KC0[4].X, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), KC0[4].X, -; CM-NEXT: EXP_IEEE * T1.W, KC0[4].X, -; CM-NEXT: EXP_IEEE T1.X (MASKED), KC0[3].W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), KC0[3].W, -; CM-NEXT: EXP_IEEE T1.Z, KC0[3].W, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), KC0[3].W, -; CM-NEXT: EXP_IEEE T1.X (MASKED), KC0[3].Z, -; CM-NEXT: EXP_IEEE T1.Y, KC0[3].Z, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), KC0[3].Z, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), KC0[3].Z, -; CM-NEXT: EXP_IEEE T1.X, KC0[3].Y, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), KC0[3].Y, -; CM-NEXT: EXP_IEEE T1.Z (MASKED), KC0[3].Y, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), KC0[3].Y, %result = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) store <4 x float> %result, ptr addrspace(1) 
%out ret void } define float @v_exp2_f32(float %in) { -; GCN-LABEL: v_exp2_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32: ; R600: ; %bb.0: @@ -449,11 +1038,33 @@ define float @v_exp2_f32(float %in) { } define float @v_exp2_fabs_f32(float %in) { -; GCN-LABEL: v_exp2_fabs_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e64 v0, |v0| -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_fabs_f32: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; 
GCN-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_fabs_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_fabs_f32: ; R600: ; %bb.0: @@ -470,11 +1081,33 @@ define float @v_exp2_fabs_f32(float %in) { } define float @v_exp2_fneg_fabs_f32(float %in) { -; GCN-LABEL: v_exp2_fneg_fabs_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e64 v0, -|v0| -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_fneg_fabs_f32: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_sub_f32_e64 v0, v2, |v0| +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_fneg_fabs_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 
+; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_fneg_fabs_f32: ; R600: ; %bb.0: @@ -492,11 +1125,33 @@ define float @v_exp2_fneg_fabs_f32(float %in) { } define float @v_exp2_fneg_f32(float %in) { -; GCN-LABEL: v_exp2_fneg_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e64 v0, -v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_fneg_f32: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; GCN-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_fneg_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_fneg_f32: ; R600: ; %bb.0: @@ -573,11 +1228,33 @@ define float @v_exp2_f32_approx_fn_attr(float %in) 
"approx-func-fp-math"="true" } define float @v_exp2_f32_ninf(float %in) { -; GCN-LABEL: v_exp2_f32_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_ninf: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_ninf: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_ninf: ; R600: ; %bb.0: @@ -694,11 +1371,33 @@ define float @v_exp2_f32_daz(float %in) #0 { } define float @v_exp2_f32_nnan(float %in) { -; GCN-LABEL: v_exp2_f32_nnan: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_nnan: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; 
GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_nnan: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_nnan: ; R600: ; %bb.0: @@ -734,11 +1433,33 @@ define float @v_exp2_f32_nnan_daz(float %in) #0 { } define float @v_exp2_f32_nnan_dynamic(float %in) #1 { -; GCN-LABEL: v_exp2_f32_nnan_dynamic: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_nnan_dynamic: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_nnan_dynamic: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 
0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_nnan_dynamic: ; R600: ; %bb.0: @@ -774,11 +1495,33 @@ define float @v_exp2_f32_ninf_daz(float %in) #0 { } define float @v_exp2_f32_ninf_dynamic(float %in) #1 { -; GCN-LABEL: v_exp2_f32_ninf_dynamic: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_ninf_dynamic: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_ninf_dynamic: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: 
v_exp2_f32_ninf_dynamic: ; R600: ; %bb.0: @@ -794,11 +1537,33 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 { } define float @v_exp2_f32_nnan_ninf(float %in) { -; GCN-LABEL: v_exp2_f32_nnan_ninf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_nnan_ninf: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_nnan_ninf: ; R600: ; %bb.0: @@ -834,11 +1599,33 @@ define float @v_exp2_f32_nnan_ninf_daz(float %in) #0 { } define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 { -; GCN-LABEL: v_exp2_f32_nnan_ninf_dynamic: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_nnan_ninf_dynamic: ; R600: ; %bb.0: @@ -874,11 +1661,33 @@ define float @v_exp2_f32_fast_daz(float %in) #0 { } define float @v_exp2_f32_dynamic_mode(float %in) #1 { -; GCN-LABEL: v_exp2_f32_dynamic_mode: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_dynamic_mode: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_dynamic_mode: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_dynamic_mode: ; R600: ; %bb.0: @@ -894,11 +1703,26 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { } define float @v_exp2_f32_undef() { -; GCN-LABEL: v_exp2_f32_undef: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_exp_f32_e32 v0, s4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-SDAG-LABEL: v_exp2_f32_undef: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: v_exp_f32_e32 v0, 0x7fc00000 +; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_exp2_f32_undef: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GCN-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 +; GCN-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_undef: ; R600: ; %bb.0: @@ -972,8 +1796,16 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_f32_from_fpext_math_f16: @@ -1031,7 +1863,15 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) { ; GCN-SDAG-LABEL: v_exp2_f32_from_fpext_bf16: ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_from_fpext_bf16: @@ -1074,6 +1914,8 @@ define half @v_exp2_f16(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_f16: @@ -1121,6 +1963,8 @@ define half @v_exp2_fabs_f16(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fabs_f16: @@ -1169,6 
+2013,8 @@ define half @v_exp2_fneg_fabs_f16(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fneg_fabs_f16: @@ -1218,6 +2064,8 @@ define half @v_exp2_fneg_f16(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fneg_f16: @@ -1266,6 +2114,8 @@ define half @v_exp2_f16_fast(half %in) { ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_f16_fast: @@ -1316,6 +2166,10 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_v2f16: @@ -1384,6 +2238,10 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fabs_v2f16: @@ -1464,6 +2322,10 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: 
v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fneg_fabs_v2f16: @@ -1545,6 +2407,10 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fneg_v2f16: @@ -1621,6 +2487,10 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_v2f16_fast: @@ -1692,6 +2562,12 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) { ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v3f16: @@ -1770,6 +2646,12 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_v3f16_afn: @@ -1849,3 +2731,5 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SI: {{.*}}