Skip to content

Commit

Permalink
[AMDGPU] Fix no waitcnt produced between LDS DMA and ds_read on gfx10 (
Browse files Browse the repository at this point in the history
…#75245)

BUFFER_LOAD_DWORD_LDS was incorrectly touching vscnt instead of the
vmcnt. This is VMEM load and DS store, so it shall use vmcnt.
  • Loading branch information
rampitec committed Dec 13, 2023
1 parent a5ffabc commit c6ecbcb
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 19 deletions.
19 changes: 8 additions & 11 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
// FLAT instruction.
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
if (!ST->hasVscnt())
// LDS DMA loads are also stores, but on the LDS side. On the VMEM side
// these should use VM_CNT.
if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
return VMEM_ACCESS;
if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
Expand Down Expand Up @@ -544,14 +546,6 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
}
}

// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
// can be accessed. A load from LDS to VMEM does not need a wait.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
return SIInstrInfo::isVALU(MI) &&
(SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
}

void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
Expand Down Expand Up @@ -703,7 +697,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setRegScore(RegNo, T, CurrScore);
}
}
if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
if (Inst.mayStore() &&
(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
// written can be accessed. A load from LDS to VMEM does not need a wait.
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
}
}
Expand Down Expand Up @@ -1178,7 +1175,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
continue;
// No need to wait before load from VMEM to LDS.
if (mayWriteLDSThroughDMA(MI))
if (TII->mayWriteLDSThroughDMA(MI))
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::DS;
}

static bool isLDSDMA(const MachineInstr &MI) {
return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
}

bool isLDSDMA(uint16_t Opcode) {
return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
}

static bool isGWS(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::GWS;
}
Expand Down Expand Up @@ -667,6 +675,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
SIInstrFlags::IsAtomicNoRet);
}

static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
}

static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
Expand Down
11 changes: 3 additions & 8 deletions llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,10 @@ declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr

; FIXME: vmcnt(0) is too strong, it shall use vmcnt(2) before the first
; ds_read_b32 and vmcnt(0) before the second.
; FIXME: GFX10 does not get a waitcount at all.

; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
; GCN-COUNT-4: buffer_load_dword
; GFX9: s_waitcnt vmcnt(0)

; FIXME:
; GFX10-NOT: s_waitcnt

; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32

; FIXME:
Expand Down Expand Up @@ -49,9 +44,9 @@ main_body:
; GFX9: s_waitcnt vmcnt(0)
; GFX9-COUNT-2: ds_read_b32

; FIXME:
; GFX10-NOT: s_waitcnt
; FIXME: can be vmcnt(2)

; GFX10: s_waitcnt vmcnt(0)
; GFX10: ds_read_b32

; FIXME:
Expand Down

0 comments on commit c6ecbcb

Please sign in to comment.