diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b5c746b81b91e..31617eef562d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4657,6 +4657,10 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, return true; } +static const unsigned SPDenormModeBitField = + AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | + (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); + // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions // to enable denorm mode. When 'Enable' is false, disable denorm mode. static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, @@ -4675,11 +4679,6 @@ static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, .addImm(NewDenormModeValue); } else { - // Select FP32 bit field in mode register. - unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | - (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); - B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) .addImm(SPDenormMode) .addImm(SPDenormModeBitField); @@ -4723,10 +4722,21 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, .setMIFlags(Flags); auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); - // FIXME: Doesn't correctly model the FP mode switch, and the FP operations - // aren't modeled as reading it. - if (Mode.FP32Denormals == DenormalMode::getPreserveSign()) + const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); + const bool HasDynamicDenormals = + (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || + (Mode.FP32Denormals.Output == DenormalMode::Dynamic); + + Register SavedSPDenormMode; + if (!PreservesDenormals) { + if (HasDynamicDenormals) { + SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + B.buildInstr(AMDGPU::S_GETREG_B32) + .addDef(SavedSPDenormMode) + .addImm(SPDenormModeBitField); + } toggleSPDenormMode(true, B, ST, Mode); + } auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); @@ -4735,10 +4745,15 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); - // FIXME: This mishandles dynamic denormal mode. We need to query the - // current mode and restore the original. - if (Mode.FP32Denormals == DenormalMode::getPreserveSign()) - toggleSPDenormMode(false, B, ST, Mode); + if (!PreservesDenormals) { + if (HasDynamicDenormals) { + assert(SavedSPDenormMode); + B.buildInstr(AMDGPU::S_SETREG_B32) + .addReg(SavedSPDenormMode) + .addImm(SPDenormModeBitField); + } else + toggleSPDenormMode(false, B, ST, Mode); + } auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) .addUse(Fma4.getReg(0)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 998904bf08820..e95aa5310db42 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9589,28 +9589,44 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const SIMachineFunctionInfo *Info = MF.getInfo(); const DenormalMode DenormMode = Info->getMode().FP32Denormals; - const bool HasFP32Denormals = DenormMode != DenormalMode::getPreserveSign(); + const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); + const bool HasDynamicDenormals = + (DenormMode.Input == DenormalMode::Dynamic) || + (DenormMode.Output == DenormalMode::Dynamic); - if (!HasFP32Denormals) { + SDValue SavedDenormMode; + + if (!PreservesDenormals) { // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV // lowering. The chain dependence is insufficient, and we need glue. We do // not need the glue variants in a strictfp function. SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Glue = DAG.getEntryNode(); + if (HasDynamicDenormals) { + SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL, + DAG.getVTList(MVT::i32, MVT::Glue), + {BitField, Glue}); + SavedDenormMode = SDValue(GetReg, 0); + + Glue = DAG.getMergeValues( + {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL); + } + SDNode *EnableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue EnableDenormValue = getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); - EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, - DAG.getEntryNode(), EnableDenormValue).getNode(); + EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue, + EnableDenormValue) + .getNode(); } else { const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); - EnableDenorm = - DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, - {EnableDenormValue, BitField, DAG.getEntryNode()}); + EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, + {EnableDenormValue, BitField, Glue}); } SDValue Ops[3] = { @@ -9640,12 +9656,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3, Flags); - if (!HasFP32Denormals) { - // FIXME: This mishandles dynamic denormal mode. We need to query the - // current mode and restore the original. - + if (!PreservesDenormals) { SDNode *DisableDenorm; - if (Subtarget->hasDenormModeInst()) { + if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { const SDValue DisableDenormValue = getSPDenormModeValue( FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); @@ -9653,8 +9666,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2)).getNode(); } else { + assert(HasDynamicDenormals == (bool)SavedDenormMode); const SDValue DisableDenormValue = - DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + HasDynamicDenormals + ? SavedDenormMode + : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); DisableDenorm = DAG.getMachineNode( AMDGPU::S_SETREG_B32, SL, MVT::Other, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 55b8dd6892097..9da75b093fc9c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -209,12 +209,15 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -225,12 +228,15 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -240,13 +246,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -256,13 +265,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX6-FLUSH-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -272,13 +284,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -288,53 +303,105 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_dynamic_denorm: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic_denorm: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_dynamic_denorm: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic_denorm: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic_denorm: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic_denorm: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float %a, %b ret float %fdiv } @@ -2547,12 +2614,15 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -2563,12 +2633,15 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -2578,13 +2651,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -2594,13 +2670,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX6-FLUSH-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -2610,13 +2689,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -2626,53 +2708,105 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic__nnan_ninf: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic__nnan_ninf: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic__nnan_ninf: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic__nnan_ninf: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -2996,12 +3130,15 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, vcc, v2, v1, v2 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3013,12 +3150,15 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v2, vcc, v2, v1, v2 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3029,13 +3169,16 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-IEEE-SLOWFMA-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v2, v1, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3046,13 +3189,16 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-FLUSH-SLOWFMA-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v2, v1, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3063,13 +3209,16 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 { ; GFX89-IEEE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v2, v1, v2 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -3080,56 +3229,111 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 { ; GFX89-FLUSH-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v2, v1, v2 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_neglhs_f32_dynamic: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 -; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, v2 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX10-NEXT: v_fma_f32 v6, -v3, v5, v2 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v4 -; GFX10-NEXT: v_fma_f32 v2, -v3, v5, v2 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_neglhs_f32_dynamic: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, v2 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_neglhs_f32_dynamic: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, v2 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2 -; GFX11-NEXT: v_rcp_f32_e32 v4, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v6, -v3, v5, v2 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v2, -v3, v5, v2 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_neglhs_f32_dynamic: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, s4, v1, v1, v2 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_neglhs_f32_dynamic: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v1, v1, v2 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_neglhs_f32_dynamic: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, null, v1, v1, v2 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v2, v1, v2 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_neglhs_f32_dynamic: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -3330,12 +3534,15 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, vcc, v0, v2, v0 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3347,12 +3554,15 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v2, vcc, v0, v2, v0 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3363,13 +3573,16 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-IEEE-SLOWFMA-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v0, v2, v0 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3380,13 +3593,16 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 { ; GFX6-FLUSH-SLOWFMA-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v0, v2, v0 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3397,13 +3613,16 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 { ; GFX89-IEEE-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -3414,56 +3633,111 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 { ; GFX89-FLUSH-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v2, v0 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_negrhs_f32_dynamic: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 -; GFX10-NEXT: v_div_scale_f32 v3, s4, v2, v2, v0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX10-NEXT: v_fma_f32 v6, -v3, v5, v2 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v4 -; GFX10-NEXT: v_fma_f32 v2, -v3, v5, v2 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_negrhs_f32_dynamic: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_negrhs_f32_dynamic: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_scale_f32 v3, null, v2, v2, v0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0 -; GFX11-NEXT: v_rcp_f32_e32 v4, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v6, -v3, v5, v2 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v2, -v3, v5, v2 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_negrhs_f32_dynamic: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, s4, v2, v2, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_negrhs_f32_dynamic: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_negrhs_f32_dynamic: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, null, v2, v2, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v2, v0 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_negrhs_f32_dynamic: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -3550,12 +3824,15 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3567,12 +3844,15 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3583,13 +3863,16 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX6-IEEE-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3600,13 +3883,16 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX6-FLUSH-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3617,13 +3903,16 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX89-IEEE-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -3634,53 +3923,105 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX89-FLUSH-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_constrhs0_dynamic: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, 0x4640e400, v0 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX10-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_constrhs0_dynamic: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, 0x4640e400, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_f32_constrhs0_dynamic: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_constrhs0_dynamic: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -3862,12 +4203,15 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3879,12 +4223,15 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3895,13 +4242,16 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX6-IEEE-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3912,13 +4262,16 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX6-FLUSH-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3929,13 +4282,16 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX89-IEEE-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -3946,53 +4302,105 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX89-FLUSH-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_constlhs0_dynamic: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x4640e400 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, 0x4640e400, v0, 0x4640e400 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x4640e400 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_constlhs0_dynamic: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 0x4640e400, v0, 0x4640e400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_f32_constlhs0_dynamic: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x4640e400 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_constlhs0_dynamic: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_constlhs0_dynamic: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -4173,12 +4581,15 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -4189,12 +4600,15 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -4204,13 +4618,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -4220,13 +4637,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX6-FLUSH-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -4236,13 +4656,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -4252,53 +4675,105 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_dynamic_nodenorm_x: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_x: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_dynamic_nodenorm_x: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_x: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_x: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_x: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_x: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -4495,12 +4970,15 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -4511,12 +4989,15 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -4526,13 +5007,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -4542,13 +5026,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX6-FLUSH-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -4558,13 +5045,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -4574,53 +5064,105 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_dynamic_nodenorm_y: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_y: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: s_denorm_mode 15 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_dynamic_nodenorm_y: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_y: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_y: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: s_denorm_mode 15 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_y: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_y: ; EG: ; %bb.0: ; EG-NEXT: CF_END diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 415f9087a372d..e9643aa65402b 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -2416,13 +2416,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -2432,13 +2435,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -2448,13 +2454,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2464,13 +2473,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2479,14 +2491,17 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2495,15 +2510,18 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2708,13 +2726,16 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 { ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -2724,13 +2745,16 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 { ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -2740,13 +2764,16 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2756,13 +2783,16 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2771,14 +2801,17 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2787,15 +2820,18 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3257,13 +3293,16 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX6-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 @@ -3274,13 +3313,16 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v5, v5 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v3, v6, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v7, v5, v6 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX6-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 @@ -3291,13 +3333,16 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX7-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 @@ -3308,13 +3353,16 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v6, -v3, v5, 1.0 ; GFX8-NEXT: v_fma_f32 v5, v6, v5, v5 ; GFX8-NEXT: v_mul_f32_e32 v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v7, -v3, v6, v4 ; GFX8-NEXT: v_fma_f32 v6, v7, v5, v6 ; GFX8-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX8-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 @@ -3324,14 +3372,17 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v4 ; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 ; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX10-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 @@ -3341,15 +3392,18 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, v0 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v4 ; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 ; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX11-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 @@ -3813,13 +3867,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -3829,13 +3886,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -3845,13 +3905,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3861,13 +3924,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3876,14 +3942,17 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3892,15 +3961,18 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4361,13 +4433,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX6-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 @@ -4378,13 +4453,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v5, v5 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v3, v6, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v7, v5, v6 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX6-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 @@ -4395,13 +4473,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX7-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 @@ -4412,13 +4493,16 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v6, -v3, v5, 1.0 ; GFX8-NEXT: v_fma_f32 v5, v6, v5, v5 ; GFX8-NEXT: v_mul_f32_e32 v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v7, -v3, v6, v4 ; GFX8-NEXT: v_fma_f32 v6, v7, v5, v6 ; GFX8-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX8-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 @@ -4428,14 +4512,17 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v4 ; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 ; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX10-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 @@ -4445,15 +4532,18 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, v0 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v4 ; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 ; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX11-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 @@ -4919,13 +5009,16 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -4935,13 +5028,16 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, -v0, v1, -v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -4951,13 +5047,16 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4967,13 +5066,16 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, -v0, v1, -v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4982,14 +5084,17 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, -v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4998,15 +5103,18 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, -v0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0 +; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5460,13 +5568,16 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -5476,13 +5587,16 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, -v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -5492,13 +5606,16 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5508,13 +5625,16 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, -v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5523,14 +5643,17 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, -v1, -v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, -v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5539,15 +5662,18 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, -v1, -v1, v0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, -v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, -v1, v0 +; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5986,13 +6112,16 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v3, v2, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -6003,13 +6132,16 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -6020,13 +6152,16 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX7-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 -; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX7-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6037,13 +6172,16 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX8-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX8-NEXT: v_div_fixup_f32 v0, v1, s6, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -6052,14 +6190,17 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6068,15 +6209,18 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 +; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 +; GFX11-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6474,13 +6618,16 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v3, v2, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -6491,13 +6638,16 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -6508,13 +6658,16 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX7-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 -; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6525,13 +6678,16 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX8-NEXT: s_mov_b32 s6, 0x4640e400 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 ; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, s6 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -6540,14 +6696,17 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x4640e400 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6556,15 +6715,18 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 +; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 +; GFX11-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7005,13 +7167,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -7021,13 +7186,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -7037,13 +7205,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7053,13 +7224,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -7068,14 +7242,17 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7084,15 +7261,18 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7540,13 +7720,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] @@ -7556,13 +7739,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX6-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] @@ -7572,13 +7758,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7588,13 +7777,16 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -7603,14 +7795,17 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7619,15 +7814,18 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31]