diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2751c6b4ea9987..c4baabcd9232b5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3454,6 +3454,19 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; + // If src2 is also a literal constant then we have to choose which one to + // fold. In general it is better to choose madak so that the other literal + // can be materialized in an sgpr instead of a vgpr: + // s_mov_b32 s0, literal + // v_madak_f32 v0, s0, v0, literal + // Instead of: + // v_mov_b32 v1, literal + // v_madmk_f32 v0, v0, literal, v1 + MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg()); + if (Def && Def->isMoveImmediate() && + !isInlineConstant(Def->getOperand(1))) + return false; + unsigned NewOpc = IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll index 9fb0cab068d286..fe649d43330417 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -7,7 +7,6 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v7, 0x3ca3d70a ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D @@ -37,33 +36,34 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GFX10-NEXT: v_fma_f32 v1, v1, v5, s28 ; GFX10-NEXT: v_max_f32_e64 v6, s0, s0 clamp ; GFX10-NEXT: v_add_f32_e64 v5, s29, -1.0 -; GFX10-NEXT: v_sub_f32_e32 v9, s0, v1 -; GFX10-NEXT: v_fma_f32 v8, -s2, v6, s6 +; GFX10-NEXT: v_sub_f32_e32 v8, s0, v1 +; GFX10-NEXT: v_fma_f32 v7, -s2, v6, s6 ; GFX10-NEXT: v_fma_f32 v5, v6, v5, 1.0 -; GFX10-NEXT: v_mad_f32 v11, s2, v6, v2 -; GFX10-NEXT: v_fmac_f32_e32 v1, v6, v9 -; GFX10-NEXT: v_fmac_f32_e32 v11, v8, v6 +; GFX10-NEXT: v_mad_f32 v10, s2, v6, v2 +; GFX10-NEXT: s_mov_b32 s0, 0x3c23d70a +; GFX10-NEXT: v_fmac_f32_e32 v1, v6, v8 +; GFX10-NEXT: v_fmac_f32_e32 v10, v7, v6 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_f32_e32 v10, s10, v0 +; GFX10-NEXT: v_mul_f32_e32 v9, s10, v0 ; GFX10-NEXT: v_fma_f32 v0, -v0, s10, s14 -; GFX10-NEXT: v_mul_f32_e32 v9, s18, v2 +; GFX10-NEXT: v_mul_f32_e32 v8, s18, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, s22, v3 -; GFX10-NEXT: v_fmac_f32_e32 v10, v0, v6 +; GFX10-NEXT: v_fmac_f32_e32 v9, v0, v6 ; GFX10-NEXT: v_sub_f32_e32 v0, v1, v5 -; GFX10-NEXT: v_mul_f32_e32 v1, v9, v6 -; GFX10-NEXT: v_mul_f32_e32 v8, v6, v3 -; GFX10-NEXT: v_fma_f32 v3, -v6, v3, v10 +; GFX10-NEXT: v_mul_f32_e32 v1, v8, v6 +; GFX10-NEXT: v_mul_f32_e32 v7, v6, v3 +; GFX10-NEXT: v_fma_f32 v3, -v6, v3, v9 ; GFX10-NEXT: v_fmac_f32_e32 v5, v0, v6 ; GFX10-NEXT: v_fma_f32 v0, v2, s26, -v1 -; GFX10-NEXT: v_fmac_f32_e32 v8, v3, v6 +; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v6 ; GFX10-NEXT: v_fmac_f32_e32 v1, v0, v6 ; GFX10-NEXT: v_mul_f32_e32 v0, v2, v6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v6 -; GFX10-NEXT: v_fmamk_f32 v4, v5, 0x3c23d70a, v7 +; GFX10-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a ; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_mul_f32_e32 v2, v8, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v7, v4 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v0 ; GFX10-NEXT: v_max_f32_e32 v0, 0, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -71,7 +71,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GFX11-LABEL: _amdgpu_ps_main: ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0x3ca3d70a +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D @@ -96,40 +96,43 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GFX11-NEXT: s_buffer_load_b128 s[20:23], s[0:3], 0x70 ; GFX11-NEXT: v_fma_f32 v1, v1, v5, s28 ; GFX11-NEXT: v_max_f32_e64 v6, s0, s0 clamp -; GFX11-NEXT: v_add_f32_e64 v5, s29, -1.0 ; GFX11-NEXT: s_buffer_load_b128 s[24:27], s[0:3], 0x10 +; GFX11-NEXT: v_add_f32_e64 v5, s29, -1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_f32_e32 v9, s0, v1 -; GFX11-NEXT: v_fma_f32 v8, -s2, v6, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_sub_f32_e32 v8, s0, v1 +; GFX11-NEXT: v_fma_f32 v7, -s2, v6, s6 +; GFX11-NEXT: v_fma_f32 v10, s2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_fma_f32 v5, v6, v5, 1.0 -; GFX11-NEXT: v_fma_f32 v11, s2, v6, v2 +; GFX11-NEXT: s_mov_b32 s0, 0x3c23d70a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e32 v10, s10, v0 +; GFX11-NEXT: v_mul_f32_e32 v9, s10, v0 ; GFX11-NEXT: v_fma_f32 v0, -v0, s10, s14 -; GFX11-NEXT: v_fmac_f32_e32 v1, v6, v9 -; GFX11-NEXT: v_mul_f32_e32 v9, s18, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, s22, v3 +; GFX11-NEXT: v_dual_fmac_f32 v1, v6, v8 :: v_dual_mul_f32 v8, s18, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fmac_f32_e32 v9, v0, v6 +; GFX11-NEXT: v_dual_fmac_f32 v10, v7, v6 :: v_dual_mul_f32 v7, v6, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fmac_f32_e32 v10, v0, v6 ; GFX11-NEXT: v_sub_f32_e32 v0, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v3, -v6, v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v6 ; GFX11-NEXT: v_fmac_f32_e32 v5, v0, v6 -; GFX11-NEXT: v_mul_f32_e32 v3, s22, v3 -; GFX11-NEXT: v_dual_fmac_f32 v11, v8, v6 :: v_dual_mul_f32 v8, v6, v3 -; GFX11-NEXT: v_mul_f32_e32 v1, v9, v6 -; GFX11-NEXT: v_fma_f32 v3, -v6, v3, v10 +; GFX11-NEXT: v_mul_f32_e32 v1, v8, v6 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v3, v4, v6 :: v_dual_fmaak_f32 v4, s0, v5, 0x3ca3d70a ; GFX11-NEXT: v_fma_f32 v0, v2, s26, -v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_fmac_f32_e32 v1, v0, v6 ; GFX11-NEXT: v_mul_f32_e32 v0, v2, v6 -; GFX11-NEXT: v_fmac_f32_e32 v8, v3, v6 -; GFX11-NEXT: v_dual_mul_f32 v3, v4, v6 :: v_dual_fmamk_f32 v4, v5, 0x3c23d70a, v7 -; GFX11-NEXT: v_dual_mul_f32 v1, v3, v1 :: v_dual_mul_f32 v2, v8, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v2, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, 0, v1 ; GFX11-NEXT: ; return to shader part epilog .entry: