-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][SIInsertWaitcnts] Do not add s_waitcnt when the counters are known to be 0 already #72830
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes: See #72829 to precommit test changes. Original patch by @jmmartinez : #65735 Patch is 3.52 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72830.diff 67 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index bf65be3fe9035e7..c8ce1903d31537c 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -25,10 +25,12 @@ void AMDGPUInstrPostProcess::postProcessInstruction(
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
switch (MCI.getOpcode()) {
case AMDGPU::S_WAITCNT:
+ case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VSCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
@@ -77,10 +79,12 @@ unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
default:
return 0;
case AMDGPU::S_WAITCNT: // This instruction
+ case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
- case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..7048aee3099d166 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -292,6 +292,13 @@ class WaitcntBrackets {
VgprVmemTypes[GprNo] = 0;
}
+ void setNonKernelFunctionInitialState() {
+ for (InstCounterType Counter : inst_counter_types()) {
+ setScoreUB(Counter, getWaitCountMax(Counter));
+ PendingEvents |= WaitEventMaskForInst[Counter];
+ }
+ }
+
void print(raw_ostream &);
void dump() { print(dbgs()); }
@@ -364,7 +371,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
const MachineRegisterInfo *MRI = nullptr;
AMDGPU::IsaVersion IV;
- DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
@@ -477,7 +483,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
+ MachineInstr *OldWaitcntInstr) const;
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
@@ -486,6 +492,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait,
MachineBasicBlock::instr_iterator It) const;
+ bool updateWaitcntIfSoft(MachineInstr *Waitcnt) const;
};
} // end anonymous namespace
@@ -870,6 +877,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
return true;
}
+bool SIInsertWaitcnts::updateWaitcntIfSoft(MachineInstr *Waitcnt) const {
+ unsigned Opcode = Waitcnt->getOpcode();
+ if (!SIInstrInfo::isSoftWaitcnt(Opcode))
+ return false;
+
+ Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
+ return true;
+}
+
/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
@@ -886,18 +902,22 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (II.isMetaInstruction())
continue;
- if (II.getOpcode() == AMDGPU::S_WAITCNT) {
+ unsigned Opcode = II.getOpcode();
+ bool CanFullyDiscardWaitcntSequence = SIInstrInfo::isSoftWaitcnt(Opcode);
+
+ if (SIInstrInfo::isWaitcnt(Opcode)) {
// Conservatively update required wait if this waitcnt was added in an
// earlier pass. In this case it will not exist in the tracked waitcnt
// set.
- if (!TrackedWaitcntSet.count(&II)) {
- unsigned IEnc = II.getOperand(0).getImm();
- AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- Wait = Wait.combined(OldWait);
- }
+ unsigned IEnc = II.getOperand(0).getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ if (CanFullyDiscardWaitcntSequence)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
// Merge consecutive waitcnt of the same type by erasing multiples.
- if (!WaitcntInstr) {
+ if (!WaitcntInstr &&
+ (Wait.hasWaitExceptVsCnt() || !CanFullyDiscardWaitcntSequence)) {
WaitcntInstr = &II;
} else {
II.eraseFromParent();
@@ -905,15 +925,17 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
}
} else {
- assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- if (!TrackedWaitcntSet.count(&II)) {
- unsigned OldVSCnt =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
- }
- if (!WaitcntVsCntInstr) {
+ unsigned OldVSCnt =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ if (CanFullyDiscardWaitcntSequence)
+ ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
+ Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+
+ if (!WaitcntVsCntInstr &&
+ (Wait.hasWaitVsCnt() || !CanFullyDiscardWaitcntSequence)) {
WaitcntVsCntInstr = &II;
} else {
II.eraseFromParent();
@@ -924,48 +946,38 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
// Updated encoding of merged waitcnt with the required wait.
if (WaitcntInstr) {
- if (Wait.hasWaitExceptVsCnt()) {
- Modified |=
- updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
- AMDGPU::encodeWaitcnt(IV, Wait));
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VmCnt = ~0u;
- Wait.LgkmCnt = ~0u;
- Wait.ExpCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: " << *WaitcntInstr
- << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntInstr << '\n');
+ Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
+ AMDGPU::encodeWaitcnt(IV, Wait));
+ Modified |= updateWaitcntIfSoft(WaitcntInstr);
- } else {
- WaitcntInstr->eraseFromParent();
- Modified = true;
- }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs()
+ << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntInstr << '\n');
}
if (WaitcntVsCntInstr) {
- if (Wait.hasWaitVsCnt()) {
- assert(ST->hasVscnt());
- Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
- AMDGPU::OpName::simm16, Wait.VsCnt);
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VsCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: "
- << *WaitcntVsCntInstr << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntVsCntInstr << '\n');
- } else {
- WaitcntVsCntInstr->eraseFromParent();
- Modified = true;
- }
+ Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
+ AMDGPU::OpName::simm16, Wait.VsCnt);
+ Modified |= updateWaitcntIfSoft(WaitcntVsCntInstr);
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VsCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntVsCntInstr
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
}
return Modified;
@@ -1284,7 +1296,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
+ MachineInstr *OldWaitcntInstr) const {
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
@@ -1317,7 +1329,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1331,7 +1342,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.VsCnt);
- TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1574,9 +1584,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
static bool isWaitInstr(MachineInstr &Inst) {
- return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
- (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
- Inst.getOperand(0).isReg() &&
+ auto Opcode = Inst.getOpcode();
+ return SIInstrInfo::isWaitcnt(Opcode) ||
+ (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
}
@@ -1845,7 +1855,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
- TrackedWaitcntSet.clear();
BlockInfos.clear();
bool Modified = false;
@@ -1863,6 +1872,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
;
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ auto NonKernelInitialState =
+ std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+ NonKernelInitialState->setNonKernelFunctionInitialState();
+ BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+
Modified = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c4baabcd9232b56..7ccaea823a2295c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8774,6 +8774,11 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+
+ // FIXME: move to the right place
+ if (SIInstrInfo::isSoftWaitcnt(Opcode))
+ Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
+
unsigned Gen = subtargetEncodingFamily(ST);
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index de2820e5c013ee3..fb11d30dbca6dca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -859,6 +859,31 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
}
+ static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
+ if (isWaitcnt(Opcode))
+ return AMDGPU::S_WAITCNT;
+
+ if (isWaitcntVsCnt(Opcode))
+ return AMDGPU::S_WAITCNT_VSCNT;
+
+ llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
+ }
+
+ static bool isWaitcnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
+ }
+
+ static bool isWaitcntVsCnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+ }
+
+ // soft waitcnt instructions can be relaxed/optimized out by SIInsertWaitcnts
+ static bool isSoftWaitcnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT_soft ||
+ Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(isCopyInstr(MI));
Register Dest = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index bc48f7b76c6d787..10ec54d3317fdf1 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1055,7 +1055,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+ .addImm(WaitCntImmediate);
Changed = true;
}
@@ -1963,14 +1964,15 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+ .addImm(WaitCntImmediate);
Changed = true;
}
if (VSCnt) {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
Changed = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 90056e6ca281e78..83b325a148a168a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1464,6 +1464,21 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
+
+// "_soft" waitcnts are waitcnts that are either relaxed into their non-soft
+// counterpart, or completely removed.
+//
+// These are inserted by the memory legalizer to resolve memory dependencies
+// and are later optimized by SIInsertWaitcnts.
+// For example, an S_WAITCNT_soft 0 can be completely removed in a function
+// that doesn't access memory.
+def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
+def S_WAITCNT_VSCNT_soft : SOPP_Pseudo<"s_soft_waitcnt_vscnt", (ins SReg_32:$sdst, s16imm:$simm16), "$sdst, $simm16"> {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_sdst = 1;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index feb65a5210d59d2..25cee87244975e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -22,7 +22,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -38,7 +37,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -53,7 +51,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_dec_rtn_u32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -67,8 +64,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
@@ -83,8 +79,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
@@ -107,7 +101,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -123,7 +116,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -138,7 +130,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -152,8 +143,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Drive-by nits.
@@ -8774,6 +8774,11 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { | |||
} | |||
|
|||
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { | |||
|
|||
// FIXME: move to the right place |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where's the right place? :) This comment should be a bit more specific
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know (just picked up the patch) - looks like the right place to me
unsigned Opcode = II.getOpcode(); | ||
bool CanFullyDiscardWaitcntSequence = SIInstrInfo::isSoftWaitcnt(Opcode); | ||
|
||
if (SIInstrInfo::isWaitcnt(Opcode)) { | ||
// Conservatively update required wait if this waitcnt was added in an | ||
// earlier pass. In this case it will not exist in the tracked waitcnt | ||
// set. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should update this comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What should it say instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, there's no more tracked waitcnt set, so we should rephrase it in terms of soft waitcnts instead.
Something along the lines of "Update required wait. If this waitcnt was added in an earlier pass, but is no longer needed, it may be removed."
I had a bunch of outstanding comments on that PR. |
Aren't they all addressed, with the exception of the added |
I don't know because I can't easily see what has changed. I'll try to take a fresh look soon. |
@@ -45,6 +45,7 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 { | |||
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |||
; GFX11-BACKOFF-NEXT: flat_load_b32 v0, v[0:1] | |||
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | |||
; GFX11-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This extra waitcnt is due to setNonKernelFunctionInitialState
Without this patch the counters start at
*** Block0 ***
VM_CNT(0):
LGKM_CNT(0):
EXP_CNT(0):
VS_CNT(0):
With the patch:
*** Block0 ***
VM_CNT(63):
LGKM_CNT(63):
EXP_CNT(7):
VS_CNT(63):
I didn't follow the full discussion around the patch so I don't know the context behind this change. I will try to dive deeper this week to provide more meaningful feedback.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am also confused by setNonKernelFunctionInitialState
. It does not seem like it should be necessary for this patch.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without it there's quite a few more changes, e.g.
- MSG_DEALLOC_VGPRS is back in
llvm.amdgcn.set.inactive.chain.arg.ll
atomicrmw-expand.ll
has a few missing s_waitcnt_vscnt null, 0x0
which I'm not sure if it's good or bad
Given that the PSDB passes with the changes I assume they're here for a reason, but Juan cannot tell us why right now so I don't know what the reasoning is
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the effect of setNonKernelFunctionInitialState
is, roughly, to say that all the wait counters are in an unknown state, instead of known to be 0. It seems very odd to do this immediately after emitting s_waitcnt 0
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, thinking about this some more, it is probably a good idea to set vscnt (only) to "unknown", since we do not insert a wait for vscnt on function entry. I guess that would undo most or all of the changes you saw.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this fine for now then?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this patch I would prefer that setNonKernelFunctionInitialState
only sets vscnt to unknown, and leaves the other counters as known to be zero. Does that change affect the tests?
Also I think it might need to do Brackets.setPendingEvent(SCRATCH_WRITE_ACCESS)
(like in #73122) to indicate that there might be outstanding scratch writes. Does that affect the tests? I'm not sure if it also needs to set the other pending event bits too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Or if you prefer we could try to get #73122 finished first, and then revisit the current patch.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this patch I would prefer that setNonKernelFunctionInitialState only sets vscnt to unknown, and leaves the other counters as known to be zero. Does that change affect the tests?
With
setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
PendingEvents |= WaitEventMaskForInst[VS_CNT];
Only CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
changes, everything else stays the same
I have no strong preference on whether to finish this first or the other patch - I just picked it up and I'm still learning about InsertWaitCnt myself. Though, I think if this patch has a lot more improvements than regressions we should land it first and add a TODO for the remaining bad cases
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hello!
In an initial version of the patch, before we introduced the soft-waitcnts, setting the counters to 0 in the initial state would end up removing the "s_waitcnt 0" placed at non-kernel function entry. This is not needed anymore since we make the distinction between soft/strong waitcnts. However, we still have to do it for vscnt as Jay said.
2ca1866
to
2753889
Compare
Gentle ping, it'd be nice to land this before the end of the year :) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any interaction with #73122, given the change to no longer auto-adding 0 on entry?
484bbb3
to
9eb1b59
Compare
gentle ping |
if (!SIInstrInfo::isSoftWaitcnt(Opcode)) | ||
return false; | ||
|
||
Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this here as well as in pseudoToMCOpcode
?
Note that SIInsertWaitcnts is iterative, so it may process the same block several times. If that happens then by the time the it runs the second time on a block, all remaining soft waitcnts will have been converted to hard ones by the first pass. Is that desirable for some reason?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this here as well as in pseudoToMCOpcode?
So soft waitcnts can be lowered into MCInsts
When I picked up this patch, IIRC, I tried to remove the one in pseudoToMCOpcode
(and make soft waitcnts illegal to lower to MCInst) but it didn't work. I can try again if you want
Note that SIInsertWaitcnts is iterative, so it may process the same block several times. If that happens then by the time the it runs the second time on a block, all remaining soft waitcnts will have been converted to hard ones by the first pass. Is that desirable for some reason?
Not sure, I'll think about it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that SIInsertWaitcnts is iterative, so it may process the same block several times. If that happens then by the time the it runs the second time on a block, all remaining soft waitcnts will have been converted to hard ones by the first pass. Is that desirable for some reason?
So my assumption is that it's a normal process. The first iteration will either eliminate the unneeded waitcnts, or make them non-soft. Further iterations on the block should act like they did before when there were no soft waitcnts.
I've renamed this function to also make it clearer because I think its current name was confusing. It's only called when we update a waitcnt (so when we know the waitcnt is needed) in order to "promote" the soft waitcnt into a normal one.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess it's OK, but it still seems odd to me that we promote soft waitcnts both here and in pseudoToMCOpcode
.
SIInsertWaitcnts
is required for correctness (it is not just an optimization) so why is the pseudoToMCOpcode
part required?
Alternatively, could we stop promoting soft waitcnts here? Or would that somehow change the behaviour of the second (or subsequent) visit to a basic block?
0a9aeb8
to
3cee7c0
Compare
3cee7c0
to
77cfab8
Compare
(Apologies for the noise but I rebased again and squashed commits as well. There are now 2 commits in this PR, one has a separate review.) |
… known to be 0 already
77cfab8
to
cdcc791
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is OK to commit now.
I'm still a little confused about why we promote soft waitcnts in two places, but at worst one of them is redundant and everything should still work correctly.
@Pierre-vh I am seeing some regressions from this change causing generation of extra unnecessary waitcnt instructions. See attached test:
|
Thanks for the test case! This is a great example of why we should not promote soft waitcnts in applyPreexistingWaitcnt. The full codegen looks like this:
The first time we visit LBB0_1, we have only seen the bb.0 predecessor so we only know about waits due to the global_load_dwordx4, and we insert the "s_waitcnt vmcnt(1)" and "s_waitcnt vmcnt(0)" that you highlighted as redundant. The second time we visit LBB0_1, we have seen the image_sample instruction in the LBB0_1 predecessor, and we insert the "s_waitcnt vmcnt(0)" before the "v_mov_b32_e32 v0, 0". This renders the other two waitcnts redundant, but we can't remove them because we have already promoted them to "hard" waitcnts. |
See #75785 for a fix. |
Since llvm#72830 the memory legalizer tests have not shown s_waitcnt instructions inserted by SIMemoryLegalizer because they have mostly been removed by SIInsertWaitcnts. Checking the MIR immediately after SIMemoryLegalizer runs fixes this so you can see exactly what the pass has inserted.
See #72829 to precommit test changes.
Original patch by @jmmartinez : #65735