diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 8415a3d77d3bc..efc38b63647b1 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -492,9 +492,6 @@ class SIInsertWaitcnts : public MachineFunctionPass { MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const; - - // Transform a soft waitcnt into a normal one. - bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; }; } // end anonymous namespace @@ -874,15 +871,6 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, return true; } -bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { - unsigned Opcode = Waitcnt->getOpcode(); - if (!SIInstrInfo::isSoftWaitcnt(Opcode)) - return false; - - Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode))); - return true; -} - /// Combine consecutive waitcnt instructions that precede \p It and follow /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added /// by previous passes. 
Currently this pass conservatively assumes that these @@ -940,7 +928,6 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( if (WaitcntInstr) { Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, AMDGPU::encodeWaitcnt(IV, Wait)); - Modified |= promoteSoftWaitCnt(WaitcntInstr); ScoreBrackets.applyWaitcnt(Wait); Wait.VmCnt = ~0u; @@ -959,7 +946,6 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( if (WaitcntVsCntInstr) { Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16, Wait.VsCnt); - Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr); ScoreBrackets.applyWaitcnt(Wait); Wait.VsCnt = ~0u; @@ -1320,7 +1306,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, if (Wait.hasWaitExceptVsCnt()) { unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_soft)).addImm(Enc); Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; @@ -1331,9 +1317,10 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, if (Wait.hasWaitVsCnt()) { assert(ST->hasVscnt()); - [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.VsCnt); + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.VsCnt); Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; @@ -1935,6 +1922,17 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } while (Repeat); + // Promote all soft waitcnts. 
+ for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB.instrs()) { + if (SIInstrInfo::isSoftWaitcnt(MI.getOpcode())) { + MI.setDesc( + TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(MI.getOpcode()))); + Modified = true; + } + } + } + if (ST->hasScalarStores()) { SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; bool HaveScalarStores = false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 70ef1fff274a4..a91780ce89762 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8784,9 +8784,6 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { } int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { - if (SIInstrInfo::isSoftWaitcnt(Opcode)) - Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); - unsigned Gen = subtargetEncodingFamily(ST); if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index 5b77024852056..caf67da9cd2d9 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -26,7 +26,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float ; GCN: successors: ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN: S_WAITCNT_soft 3952 + ; GCN: S_WAITCNT 3952 ; GCN: bb.3: entry: %cc = icmp sgt i32 %a, 0 @@ -63,7 +63,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a, ; GCN: successors: ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) 
undef`, addrspace 1) - ; GCN: S_WAITCNT_soft 3952 + ; GCN: S_WAITCNT 3952 ; GCN: bb.5: entry: %cc = icmp sgt i32 %a, 0 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-waterfall.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-waterfall.mir new file mode 100644 index 0000000000000..c32161dd1246c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-waterfall.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -march=amdgcn -start-before=si-insert-waitcnts -mcpu=gfx1030 -verify-machineinstrs -o - %s | FileCheck %s + +--- | + define amdgpu_ps <4 x float> @test_waterfall_multi_begin(ptr addrspace(4) inreg %in, ptr addrspace(4) inreg %s_in, i32 %idx1, i32 %idx2, i32 %s_idx, i32 %s_idx2) #0 { + ; CHECK-LABEL: test_waterfall_multi_begin: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; CHECK-NEXT: s_mov_b32 s4, exec_lo + ; CHECK-NEXT: s_wqm_b32 exec_lo, exec_lo + ; CHECK-NEXT: v_mov_b32_e32 v7, v2 + ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 + ; CHECK-NEXT: v_mov_b32_e32 v6, v1 + ; CHECK-NEXT: v_mov_b32_e32 v5, v0 + ; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v7 + ; CHECK-NEXT: v_lshlrev_b64 v[2:3], 4, v[3:4] + ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 5, v[7:8] + ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 + ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo + ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, s2, v2 + ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo + ; CHECK-NEXT: s_clause 0x1 + ; CHECK-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16 + ; CHECK-NEXT: global_load_dwordx4 v[7:10], v[0:1], off + ; CHECK-NEXT: global_load_dwordx4 v[15:18], v[2:3], off + ; CHECK-NEXT: s_mov_b32 s0, exec_lo + ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: v_readfirstlane_b32 s1, v5 + ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 + ; CHECK-NEXT: v_cmp_eq_u32_e64 s1, s1, v5 + ; CHECK-NEXT: v_cmp_eq_u32_e64 s2, s2, v6 + ; CHECK-NEXT: s_and_b32 
s1, s1, s2 + ; CHECK-NEXT: s_and_saveexec_b32 s1, s1 + ; CHECK-NEXT: s_waitcnt vmcnt(0) + ; CHECK-NEXT: v_mov_b32_e32 v0, 0 + ; CHECK-NEXT: v_readfirstlane_b32 s8, v7 + ; CHECK-NEXT: v_readfirstlane_b32 s9, v8 + ; CHECK-NEXT: v_readfirstlane_b32 s10, v9 + ; CHECK-NEXT: v_readfirstlane_b32 s11, v10 + ; CHECK-NEXT: v_readfirstlane_b32 s12, v11 + ; CHECK-NEXT: v_readfirstlane_b32 s13, v12 + ; CHECK-NEXT: v_readfirstlane_b32 s14, v13 + ; CHECK-NEXT: v_readfirstlane_b32 s15, v14 + ; CHECK-NEXT: v_readfirstlane_b32 s16, v15 + ; CHECK-NEXT: v_readfirstlane_b32 s17, v16 + ; CHECK-NEXT: v_readfirstlane_b32 s18, v17 + ; CHECK-NEXT: v_readfirstlane_b32 s19, v18 + ; CHECK-NEXT: v_mov_b32_e32 v1, v0 + ; CHECK-NEXT: ; implicit-def: $vgpr5 + ; CHECK-NEXT: ; implicit-def: $vgpr6 + ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 + ; CHECK-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18 + ; CHECK-NEXT: image_sample v[0:3], v[0:1], s[8:15], s[16:19] dmask:0xf dim:SQ_RSRC_IMG_2D + ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s1 + ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 + ; CHECK-NEXT: ; %bb.2: + ; CHECK-NEXT: s_mov_b32 exec_lo, s0 + ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, s4 + ; CHECK-NEXT: s_waitcnt vmcnt(0) + ; CHECK-NEXT: ; return to shader part epilog + ret <4 x float> poison + } + + attributes #0 = { nounwind "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" "target-cpu"="gfx1030" "uniform-work-group-size"="false" } +... 
+--- +name: test_waterfall_multi_begin +tracksRegLiveness: true +machineFunctionInfo: + returnsVoid: false + psInputAddr: 15 + psInputEnable: 15 +body: | + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + $sgpr4 = S_MOV_B32 $exec_lo + $exec_lo = S_WQM_B32 $exec_lo, implicit-def $scc + $vgpr7 = V_MOV_B32_e32 killed $vgpr2, implicit $exec, implicit $exec + renamable $vgpr4 = V_ASHRREV_I32_e32 31, $vgpr3, implicit $exec + $vgpr6 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $vgpr0, implicit $exec, implicit $exec + renamable $vgpr8 = V_ASHRREV_I32_e32 31, $vgpr7, implicit $exec + renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 4, killed $vgpr3_vgpr4, implicit $exec + renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 5, killed $vgpr7_vgpr8, implicit $exec + renamable $vgpr0, renamable $vcc_lo = V_ADD_CO_U32_e64 killed $sgpr0, killed $vgpr0, 0, implicit $exec + renamable $vgpr1 = V_ADDC_U32_e32 killed $sgpr1, killed $vgpr1, implicit-def dead $vcc, implicit killed $vcc, implicit $exec + renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 killed $sgpr2, killed $vgpr2, 0, implicit $exec + renamable $vgpr3 = V_ADDC_U32_e32 killed $sgpr3, killed $vgpr3, implicit-def dead $vcc, implicit killed $vcc, implicit $exec + renamable $vgpr11_vgpr12_vgpr13_vgpr14 = GLOBAL_LOAD_DWORDX4 renamable $vgpr0_vgpr1, 16, 0, implicit $exec + renamable $vgpr7_vgpr8_vgpr9_vgpr10 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + renamable $vgpr15_vgpr16_vgpr17_vgpr18 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec + renamable $sgpr0 = S_MOV_B32 $exec_lo + + bb.1: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $sgpr4, $vgpr5, $vgpr6, $vgpr15_vgpr16_vgpr17_vgpr18:0x00000000000000FF, $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14:0x000000000000FFFF + + renamable $sgpr1 = V_READFIRSTLANE_B32 $vgpr5, implicit $exec + 
renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr6, implicit $exec + renamable $sgpr1 = V_CMP_EQ_U32_e64 killed $sgpr1, killed $vgpr5, implicit $exec + renamable $sgpr2 = V_CMP_EQ_U32_e64 killed $sgpr2, killed $vgpr6, implicit $exec + renamable $sgpr1 = S_AND_B32 killed renamable $sgpr1, killed renamable $sgpr2, implicit-def dead $scc + renamable $sgpr1 = S_AND_SAVEEXEC_B32 killed renamable $sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec + renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr7, implicit $exec + renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr8, implicit $exec + renamable $sgpr10 = V_READFIRSTLANE_B32 killed $vgpr9, implicit $exec + renamable $sgpr11 = V_READFIRSTLANE_B32 killed $vgpr10, implicit $exec + renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec + renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr12, implicit $exec + renamable $sgpr14 = V_READFIRSTLANE_B32 killed $vgpr13, implicit $exec + renamable $sgpr15 = V_READFIRSTLANE_B32 killed $vgpr14, implicit $exec + renamable $sgpr16 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + renamable $sgpr17 = V_READFIRSTLANE_B32 killed $vgpr16, implicit $exec + renamable $sgpr18 = V_READFIRSTLANE_B32 killed $vgpr17, implicit $exec + renamable $sgpr19 = V_READFIRSTLANE_B32 killed $vgpr18, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec + renamable $vgpr5 = IMPLICIT_DEF + renamable $vgpr6 = IMPLICIT_DEF + renamable $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 = IMPLICIT_DEF + renamable $vgpr15_vgpr16_vgpr17_vgpr18 = IMPLICIT_DEF + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 killed renamable $vgpr0_vgpr1, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) + $exec_lo = S_XOR_B32 $exec_lo, killed 
renamable $sgpr1, implicit-def dead $scc + S_CBRANCH_EXECNZ %bb.1, implicit $exec + + bb.2: + liveins: $sgpr0, $sgpr4, $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF + + $exec_lo = S_MOV_B32 killed renamable $sgpr0 + $exec_lo = S_AND_B32 $exec_lo, killed renamable $sgpr4, implicit-def $scc + SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3 +...