Skip to content

Commit

Permalink
[AMDGPU] Reimplement the GFX11 early release VGPRs optimization
Browse files Browse the repository at this point in the history
Implement this optimization in SIInsertWaitcnts, where we already have
information about whether there might be outstanding VMEM store
instructions. This has the following advantages:
- Correctly handles atomics-with-return.
- Correctly handles call instructions.
- Should be faster because it does not require running a separate pass.

Differential Revision: https://reviews.llvm.org/D153279
  • Loading branch information
jayfoad committed Jun 19, 2023
1 parent 632ccc5 commit eb74917
Show file tree
Hide file tree
Showing 13 changed files with 54 additions and 637 deletions.
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,6 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;

void initializeAMDGPUReleaseVGPRsPass(PassRegistry &);
extern char &AMDGPUReleaseVGPRsID;

void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;

Expand Down
156 changes: 0 additions & 156 deletions llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp

This file was deleted.

4 changes: 0 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteUndefForPHIPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeAMDGPUReleaseVGPRsPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
Expand Down Expand Up @@ -1431,9 +1430,6 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);

if (getOptLevel() > CodeGenOpt::Less)
addPass(&AMDGPUReleaseVGPRsID);

if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
addPass(&AMDGPUInsertDelayAluID);

Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegBankCombiner.cpp
AMDGPURegBankSelect.cpp
AMDGPURegisterBankInfo.cpp
AMDGPUReleaseVGPRs.cpp
AMDGPURemoveIncompatibleFunctions.cpp
AMDGPUResourceUsageAnalysis.cpp
AMDGPURewriteOutArguments.cpp
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool ForceEmitZeroWaitcnts;
bool ForceEmitWaitcnt[NUM_INST_CNTS];

// S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;

public:
static char ID;

Expand Down Expand Up @@ -1032,6 +1036,15 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(allZeroWaitcnt());
}
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly
// release all VGPRs before the stores have completed.
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
ScoreBrackets.getScoreRange(VS_CNT) != 0)
ReleaseVGPRInsts.insert(&MI);
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
Expand Down Expand Up @@ -1930,5 +1943,14 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
}

// Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
// instructions.
for (MachineInstr *MI : ReleaseVGPRInsts) {
BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
Modified = true;
}
ReleaseVGPRInsts.clear();

return Modified;
}
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
Expand Down
80 changes: 26 additions & 54 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX900 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10PLUS %s

define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
; GFX6-LABEL: atomic_swap_i32_1d:
Expand Down Expand Up @@ -1059,32 +1059,18 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_cmpswap_i32_1d_no_return:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_cmpswap_i32_1d_no_return:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s0, s2
; GFX10PLUS-NEXT: s_mov_b32 s1, s3
; GFX10PLUS-NEXT: s_mov_b32 s2, s4
; GFX10PLUS-NEXT: s_mov_b32 s3, s5
; GFX10PLUS-NEXT: s_mov_b32 s4, s6
; GFX10PLUS-NEXT: s_mov_b32 s5, s7
; GFX10PLUS-NEXT: s_mov_b32 s6, s8
; GFX10PLUS-NEXT: s_mov_b32 s7, s9
; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
; GFX10PLUS-NEXT: s_endpgm
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
ret void
Expand Down Expand Up @@ -2760,32 +2746,18 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_cmpswap_i64_1d_no_return:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_cmpswap_i64_1d_no_return:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s0, s2
; GFX10PLUS-NEXT: s_mov_b32 s1, s3
; GFX10PLUS-NEXT: s_mov_b32 s2, s4
; GFX10PLUS-NEXT: s_mov_b32 s3, s5
; GFX10PLUS-NEXT: s_mov_b32 s4, s6
; GFX10PLUS-NEXT: s_mov_b32 s5, s7
; GFX10PLUS-NEXT: s_mov_b32 s6, s8
; GFX10PLUS-NEXT: s_mov_b32 s7, s9
; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
; GFX10PLUS-NEXT: s_endpgm
main_body:
%v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
ret void
Expand Down
3 changes: 0 additions & 3 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4382,7 +4382,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: scratch_store_b32 off, v31, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v32i32:
Expand Down Expand Up @@ -4552,7 +4551,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: scratch_store_b32 off, v32, s4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_v32i32_i32:
Expand Down Expand Up @@ -4926,7 +4924,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/cc-update.ll
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm

entry:
Expand Down Expand Up @@ -517,7 +516,6 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1026,7 +1026,6 @@
; GCN-O2-NEXT: SI Final Branch Preparation
; GCN-O2-NEXT: SI peephole optimizations
; GCN-O2-NEXT: Post RA hazard recognizer
; GCN-O2-NEXT: Release VGPRs
; GCN-O2-NEXT: AMDGPU Insert Delay ALU
; GCN-O2-NEXT: Branch relaxation pass
; GCN-O2-NEXT: Register Usage Information Collector Pass
Expand Down Expand Up @@ -1350,7 +1349,6 @@
; GCN-O3-NEXT: SI Final Branch Preparation
; GCN-O3-NEXT: SI peephole optimizations
; GCN-O3-NEXT: Post RA hazard recognizer
; GCN-O3-NEXT: Release VGPRs
; GCN-O3-NEXT: AMDGPU Insert Delay ALU
; GCN-O3-NEXT: Branch relaxation pass
; GCN-O3-NEXT: Register Usage Information Collector Pass
Expand Down

0 comments on commit eb74917

Please sign in to comment.