[AMDGPU] Do not release VGPRs if there may be pending scratch stores
Differential Revision: https://reviews.llvm.org/D153295
jayfoad committed Jun 19, 2023
1 parent b3a08fa commit 4b6d41c
Showing 28 changed files with 52 additions and 957 deletions.
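For orientation before the diff: on GFX11 the pass previously sent MSG_DEALLOC_VGPRS before s_endpgm whenever any VMEM stores were outstanding. This commit splits the "VMEM write" wait event in two so that writes which may target scratch memory veto the early release. The following is a minimal, self-contained sketch of the resulting policy; the names canReleaseVgprsEarly and PendingEvents are illustrative, not upstream identifiers.

    #include <cstdint>

    // Illustrative event split mirroring the enum change in the diff below.
    enum WaitEvent : unsigned {
      VMEM_WRITE_ACCESS,    // vector-memory write that is not scratch
      SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
    };

    struct PendingEvents {
      uint32_t Bits = 0;
      void set(WaitEvent E) { Bits |= 1u << E; }
      bool has(WaitEvent E) const { return (Bits >> E) & 1u; }
    };

    // MSG_DEALLOC_VGPRS may be sent before s_endpgm only when no
    // outstanding store can target scratch: released VGPRs must not have
    // scratch traffic still in flight.
    bool canReleaseVgprsEarly(bool IsGfx11Plus, unsigned VsCntScoreRange,
                              const PendingEvents &Events) {
      return IsGfx11Plus && VsCntScoreRange != 0 &&
             !Events.has(SCRATCH_WRITE_ACCESS);
    }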
69 changes: 52 additions & 17 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -86,19 +86,20 @@ struct RegisterEncoding {
};

enum WaitEventType {
-  VMEM_ACCESS,       // vector-memory read & write
-  VMEM_READ_ACCESS,  // vector-memory read
-  VMEM_WRITE_ACCESS, // vector-memory write
-  LDS_ACCESS,        // lds read & write
-  GDS_ACCESS,        // gds read & write
-  SQ_MESSAGE,        // send message
-  SMEM_ACCESS,       // scalar-memory read & write
-  EXP_GPR_LOCK,      // export holding on its data src
-  GDS_GPR_LOCK,      // GDS holding on its data and addr src
-  EXP_POS_ACCESS,    // write to export position
-  EXP_PARAM_ACCESS,  // write to export parameter
-  VMW_GPR_LOCK,      // vector-memory write holding on its data src
-  EXP_LDS_ACCESS,    // read by ldsdir counting as export
+  VMEM_ACCESS,          // vector-memory read & write
+  VMEM_READ_ACCESS,     // vector-memory read
+  VMEM_WRITE_ACCESS,    // vector-memory write that is not scratch
+  SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
+  LDS_ACCESS,           // lds read & write
+  GDS_ACCESS,           // gds read & write
+  SQ_MESSAGE,           // send message
+  SMEM_ACCESS,          // scalar-memory read & write
+  EXP_GPR_LOCK,         // export holding on its data src
+  GDS_GPR_LOCK,         // GDS holding on its data and addr src
+  EXP_POS_ACCESS,       // write to export position
+  EXP_PARAM_ACCESS,     // write to export parameter
+  VMW_GPR_LOCK,         // vector-memory write holding on its data src
+  EXP_LDS_ACCESS,       // read by ldsdir counting as export
NUM_WAIT_EVENTS,
};

@@ -108,7 +109,7 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
(1 << SQ_MESSAGE),
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
-    (1 << VMEM_WRITE_ACCESS)};
+    (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)};

// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
@@ -458,13 +459,19 @@ class SIInsertWaitcnts : public MachineFunctionPass {
assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
if (!ST->hasVscnt())
return VMEM_ACCESS;
-    if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst))
+    if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
+      // FLAT and SCRATCH instructions may access scratch. Other VMEM
+      // instructions do not.
+      if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+        return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
+    }
return VMEM_READ_ACCESS;
}

bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+  bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
@@ -1036,11 +1043,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
}
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly
-  // release all VGPRs before the stores have completed.
+  // release all VGPRs before the stores have completed, but it is only safe to
+  // do this if there are no outstanding scratch stores.
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
-        ScoreBrackets.getScoreRange(VS_CNT) != 0)
+        ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
+        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
ReleaseVGPRInsts.insert(&MI);
}
// Resolve vm waits before gs-done.
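ReleaseVGPRInsts collects the s_endpgm instructions that still qualify; the emission point is elsewhere in the pass and not part of this diff. A hedged sketch of what that consumer plausibly looks like (BuildMI, S_SENDMSG, and ID_DEALLOC_VGPRS_GFX11Plus are real LLVM names, but the exact upstream loop may differ):

    // Sketch only: place a MSG_DEALLOC_VGPRS send immediately before
    // each collected S_ENDPGM. ID_DEALLOC_VGPRS_GFX11Plus is the GFX11+
    // message id defined in the AMDGPU backend.
    for (MachineInstr *MI : ReleaseVGPRInsts) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_SENDMSG))
          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
      Modified = true;
    }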
@@ -1396,6 +1405,32 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}

+// This is a flat memory operation. Check to see if it has memory tokens for
+// either scratch or FLAT.
+bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
+    const MachineInstr &MI) const {
+  assert(TII->isFLAT(MI));
+
+  // SCRATCH instructions always access scratch.
+  if (TII->isFLATScratch(MI))
+    return true;
+
+  // GLOBAL instructions never access scratch.
+  if (TII->isFLATGlobal(MI))
+    return false;
+
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access scratch.
+  if (MI.memoperands_empty())
+    return true;
+
+  // See if any memory operand specifies an address space that involves scratch.
+  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+    unsigned AS = Memop->getAddrSpace();
+    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+  });
+}

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
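The address-space check above is the conservative core of the new predicate: a generic flat pointer may resolve to scratch at runtime, so FLAT_ADDRESS is treated like PRIVATE_ADDRESS. A standalone sketch of the same rule, assuming the standard AMDGPU address-space numbering (0 = flat, 1 = global, 5 = private); the enum below is illustrative rather than the real AMDGPUAS header:

    #include <cstdio>

    enum AddressSpace : unsigned {
      FLAT_ADDRESS = 0,    // may resolve to global, local, or private
      GLOBAL_ADDRESS = 1,  // never scratch
      PRIVATE_ADDRESS = 5, // scratch by definition
    };

    // Mirrors the lambda in mayAccessScratchThroughFlat: anything that
    // could be private memory counts as possibly-scratch.
    bool mayBeScratch(AddressSpace AS) {
      return AS == PRIVATE_ADDRESS || AS == FLAT_ADDRESS;
    }

    int main() {
      std::printf("global:%d flat:%d private:%d\n",
                  mayBeScratch(GLOBAL_ADDRESS), mayBeScratch(FLAT_ADDRESS),
                  mayBeScratch(PRIVATE_ADDRESS));
      // prints: global:0 flat:1 private:1
    }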
3 changes: 0 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -54,7 +54,6 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s0, s32, s0
; GFX11-NEXT: scratch_store_b32 off, v0, s0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 4, addrspace(5)
store i32 0, ptr addrspace(5) %alloca
@@ -194,7 +193,6 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s0, s32, s0
; GFX11-NEXT: scratch_store_b32 off, v0, s0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 16, addrspace(5)
store i32 0, ptr addrspace(5) %alloca
@@ -337,7 +335,6 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
; GFX11-NEXT: s_add_u32 s0, s32, s0
; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00
; GFX11-NEXT: scratch_store_b32 off, v0, s0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 32, addrspace(5)
store i32 0, ptr addrspace(5) %alloca
12 changes: 0 additions & 12 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -1471,7 +1471,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 {
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, ptr %out
@@ -1551,7 +1550,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
@@ -1577,7 +1575,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
@@ -1637,7 +1634,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
@@ -1743,7 +1739,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v3
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
@@ -1824,7 +1819,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
@@ -2005,7 +1999,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, ptr %out
@@ -2100,7 +2093,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
Expand Down Expand Up @@ -2128,7 +2120,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
@@ -2193,7 +2184,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
@@ -2311,7 +2301,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
@@ -2397,7 +2386,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
@@ -698,7 +698,6 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid
@@ -802,7 +801,6 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid
@@ -916,7 +914,6 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
@@ -1016,7 +1013,6 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -93,7 +93,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[2:3], s0
; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b)
store volatile double %result, ptr undef
@@ -126,7 +125,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7)
store volatile double %result, ptr undef
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -87,7 +87,6 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB2_2: ; %then
; GFX11-NEXT: s_endpgm
3 changes: 0 additions & 3 deletions llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -57,7 +57,6 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX1100-NEXT: v_mov_b32_e32 v0, 0
; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
@@ -309,7 +308,6 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
@@ -585,7 +583,6 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX1100-NEXT: s_waitcnt vmcnt(0)
; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:8 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -113,7 +113,6 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%ld0 = load i32, ptr %lb
@@ -243,7 +242,6 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%ld0 = load i32, ptr %lb
