From 74df19ec4de074ac32341a91b87577db9fa4cf95 Mon Sep 17 00:00:00 2001
From: pvanhout
Date: Fri, 26 Sep 2025 15:05:41 +0200
Subject: [PATCH 1/3] [AMDGPU][InsertWaitCnts] Refactor some helper functions, NFC

- Remove one-line wrappers around a simple function call when they're only
  used once or twice.
- Move very generic helpers into SIInstrInfo.
- Delete unused functions.

The goal is simply to reduce the noise in SIInsertWaitCnts without hiding
functionality. I focused on moving trivial helpers, or helpers with very
descriptive/verbose names (so that not too much logic is hidden away from
the pass), that also have some potential for reuse.

I'm also trying to make the code style more consistent: it doesn't make
sense to see a call to `TII->isXXX` and then, right next to it, a call to a
local `isY` method that just wraps `TII->isY`.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 115 +++-----------------
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      |  53 +++++++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        |  23 ++++
 3 files changed, 89 insertions(+), 102 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f291191dbfd5c..602b2c34ef72f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -495,13 +495,6 @@ class SIInsertWaitcnts {
   bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
   bool run(MachineFunction &MF);
 
-  bool isForceEmitWaitcnt() const {
-    for (auto T : inst_counter_types())
-      if (ForceEmitWaitcnt[T])
-        return true;
-    return false;
-  }
-
   void setForceEmitWaitcnt() {
     // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
     // For debug builds, get the debug counter info and adjust if need be
@@ -570,10 +563,6 @@ class SIInsertWaitcnts {
     return VmemReadMapping[getVmemType(Inst)];
   }
 
-  bool hasXcnt() const { return ST->hasWaitXCnt(); }
-
-  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
-  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool isVmemAccess(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
@@ -591,7 +580,6 @@ class SIInsertWaitcnts {
                                WaitcntBrackets &ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
-  static bool asynchronouslyWritesSCC(unsigned Opcode);
 };
 
 // This objects maintains the current score brackets of each wait counter, and
@@ -1109,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       setRegScore(FIRST_LDS_VGPR, T, CurrScore);
     }
 
-    if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
+    if (SIInstrInfo::asynchronouslyWritesSCC(Inst.getOpcode())) {
       setRegScore(SCC, T, CurrScore);
       PendingSCCWrite = &Inst;
     }
@@ -1831,12 +1819,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
   return Modified;
 }
 
-static bool readsVCCZ(const MachineInstr &MI) {
-  unsigned Opc = MI.getOpcode();
-  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
-         !MI.getOperand(1).isUndef();
-}
-
 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
   // Currently all conventions wait, but this may not always be the case.
@@ -2061,7 +2043,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
       }
 
-      if (hasXcnt() && Op.isDef())
+      if (ST->hasWaitXCnt() && Op.isDef())
         ScoreBrackets.determineWait(X_CNT, Interval, Wait);
     }
   }
@@ -2087,10 +2069,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // TODO: Remove this work-around, enable the assert for Bug 457939
   // after fixing the scheduler. Also, the Shader Compiler code is
   // independent of target.
-  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
-    if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
-      Wait.DsCnt = 0;
-    }
+  if (SIInstrInfo::readsVCCZ(MI) && ST->hasReadVCCZBug() &&
+      ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+    Wait.DsCnt = 0;
   }
 
   // Verify that the wait is actually needed.
@@ -2185,75 +2166,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
   return Modified;
 }
 
-// This is a flat memory operation. Check to see if it has memory tokens other
-// than LDS. Other address spaces supported by flat memory operations involve
-// global memory.
-bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
-  assert(TII->isFLAT(MI));
-
-  // All flat instructions use the VMEM counter except prefetch.
-  if (!TII->usesVM_CNT(MI))
-    return false;
-
-  // If there are no memory operands then conservatively assume the flat
-  // operation may access VMEM.
-  if (MI.memoperands_empty())
-    return true;
-
-  // See if any memory operand specifies an address space that involves VMEM.
-  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
-  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
-  // (GDS) address space is not supported by flat operations. Therefore, simply
-  // return true unless only the LDS address space is found.
-  for (const MachineMemOperand *Memop : MI.memoperands()) {
-    unsigned AS = Memop->getAddrSpace();
-    assert(AS != AMDGPUAS::REGION_ADDRESS);
-    if (AS != AMDGPUAS::LOCAL_ADDRESS)
-      return true;
-  }
-
-  return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either LDS or FLAT.
-bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
-  assert(TII->isFLAT(MI));
-
-  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
-  if (!TII->usesLGKM_CNT(MI))
-    return false;
-
-  // If in tgsplit mode then there can be no use of LDS.
-  if (ST->isTgSplitEnabled())
-    return false;
-
-  // If there are no memory operands then conservatively assume the flat
-  // operation may access LDS.
-  if (MI.memoperands_empty())
-    return true;
-
-  // See if any memory operand specifies an address space that involves LDS.
-  for (const MachineMemOperand *Memop : MI.memoperands()) {
-    unsigned AS = Memop->getAddrSpace();
-    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
-      return true;
-  }
-
-  return false;
-}
-
 bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
-  return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+  return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
          (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
 }
 
-static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
-  auto Opc = Inst.getOpcode();
-  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
-         Opc == AMDGPU::GLOBAL_WBINV;
-}
-
 // Return true if the next instruction is S_ENDPGM, following fallthrough
 // blocks if necessary.
 bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -2331,7 +2248,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
   } else if (TII->isFLAT(Inst)) {
-    if (isGFX12CacheInvOrWBInst(Inst)) {
+    if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
      return;
@@ -2341,14 +2258,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
 
     int FlatASCount = 0;
 
-    if (mayAccessVMEMThroughFlat(Inst)) {
+    if (TII->mayAccessVMEMThroughFlat(Inst)) {
       ++FlatASCount;
       IsVMEMAccess = true;
       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                    Inst);
     }
 
-    if (mayAccessLDSThroughFlat(Inst)) {
+    if (TII->mayAccessLDSThroughFlat(Inst)) {
       ++FlatASCount;
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
@@ -2394,7 +2311,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
     else
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
-  } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
+  } else if (SIInstrInfo::asynchronouslyWritesSCC(Inst.getOpcode())) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
   } else {
     switch (Inst.getOpcode()) {
@@ -2413,7 +2330,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
     }
   }
 
-  if (!hasXcnt())
+  if (!ST->hasWaitXCnt())
     return;
 
   if (IsVMEMAccess)
@@ -2516,12 +2433,6 @@ static bool isWaitInstr(MachineInstr &Inst) {
          counterTypeForInstr(Opcode).has_value();
 }
 
-bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
-  return Opcode == AMDGPU::S_BARRIER_LEAVE ||
-         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
-         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
-}
-
 // Generate s_waitcnt instructions where needed.
 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                             MachineBasicBlock &Block,
@@ -2578,7 +2489,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       OldWaitcntInstr = nullptr;
 
     // Restore vccz if it's not known to be correct already.
-    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+    bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::readsVCCZ(Inst);
 
     // Don't examine operands unless we need to track vccz correctness.
     if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
@@ -2701,7 +2612,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush(
 
 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
   if (SIInstrInfo::isFLAT(MI))
-    return mayAccessVMEMThroughFlat(MI);
+    return TII->mayAccessVMEMThroughFlat(MI);
   return SIInstrInfo::isVMEM(MI);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 044ea866342c2..56435a50c87ad 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
   });
 }
 
+bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
+  assert(isFLAT(MI));
+
+  // All flat instructions use the VMEM counter except prefetch.
+  if (!usesVM_CNT(MI))
+    return false;
+
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access VMEM.
+  if (MI.memoperands_empty())
+    return true;
+
+  // See if any memory operand specifies an address space that involves VMEM.
+  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
+  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
+  // (GDS) address space is not supported by flat operations. Therefore, simply
+  // return true unless only the LDS address space is found.
+  for (const MachineMemOperand *Memop : MI.memoperands()) {
+    unsigned AS = Memop->getAddrSpace();
+    assert(AS != AMDGPUAS::REGION_ADDRESS);
+    if (AS != AMDGPUAS::LOCAL_ADDRESS)
+      return true;
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+  assert(isFLAT(MI));
+
+  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
+  if (!usesLGKM_CNT(MI))
+    return false;
+
+  // If in tgsplit mode then there can be no use of LDS.
+  if (ST.isTgSplitEnabled())
+    return false;
+
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access LDS.
+  if (MI.memoperands_empty())
+    return true;
+
+  // See if any memory operand specifies an address space that involves LDS.
+  for (const MachineMemOperand *Memop : MI.memoperands()) {
+    unsigned AS = Memop->getAddrSpace();
+    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
+      return true;
+  }
+
+  return false;
+}
+
 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
   // Skip the full operand and register alias search modifiesRegister
   // does. There's only a handful of instructions that touch this, it's only an
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 31a2d55e1baad..12bba3bd3e00b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -688,6 +688,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   /// to not hit scratch.
   bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
 
+  /// \returns true for FLAT instructions that can access VMEM.
+  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
+
+  /// \returns true for FLAT instructions that can access LDS.
+  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+
   static bool isBlockLoadStore(uint16_t Opcode) {
     switch (Opcode) {
     case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
@@ -748,6 +754,18 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
   }
 
+  static bool asynchronouslyWritesSCC(unsigned Opcode) {
+    return Opcode == AMDGPU::S_BARRIER_LEAVE ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
+  }
+
+  static bool readsVCCZ(const MachineInstr &MI) {
+    unsigned Opc = MI.getOpcode();
+    return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+           !MI.getOperand(1).isUndef();
+  }
+
   static bool isWQM(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::WQM;
   }
@@ -1011,6 +1029,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
            Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER;
   }
 
+  static bool isGFX12CacheInvOrWBInst(unsigned Opc) {
+    return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+           Opc == AMDGPU::GLOBAL_WBINV;
+  }
+
   static bool isF16PseudoScalarTrans(unsigned Opcode) {
     return Opcode == AMDGPU::V_S_EXP_F16_e64 ||
            Opcode == AMDGPU::V_S_LOG_F16_e64 ||

From e751f3029c170a2412d65af27020f02ecab9f85d Mon Sep 17 00:00:00 2001
From: pvanhout
Date: Tue, 30 Sep 2025 13:44:31 +0200
Subject: [PATCH 2/3] Make names more specific

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8 ++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 602b2c34ef72f..91136fd85c545 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1097,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       setRegScore(FIRST_LDS_VGPR, T, CurrScore);
     }
 
-    if (SIInstrInfo::asynchronouslyWritesSCC(Inst.getOpcode())) {
+    if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
       setRegScore(SCC, T, CurrScore);
       PendingSCCWrite = &Inst;
     }
@@ -2069,7 +2069,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // TODO: Remove this work-around, enable the assert for Bug 457939
   // after fixing the scheduler. Also, the Shader Compiler code is
   // independent of target.
-  if (SIInstrInfo::readsVCCZ(MI) && ST->hasReadVCCZBug() &&
+  if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
       ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
     Wait.DsCnt = 0;
   }
@@ -2311,7 +2311,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
     else
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
-  } else if (SIInstrInfo::asynchronouslyWritesSCC(Inst.getOpcode())) {
+  } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
   } else {
     switch (Inst.getOpcode()) {
@@ -2489,7 +2489,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
      OldWaitcntInstr = nullptr;
 
     // Restore vccz if it's not known to be correct already.
-    bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::readsVCCZ(Inst);
+    bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
 
     // Don't examine operands unless we need to track vccz correctness.
     if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 12bba3bd3e00b..1ececdcc90d08 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -754,13 +754,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
   }
 
-  static bool asynchronouslyWritesSCC(unsigned Opcode) {
+  static bool isSBarrierSCCWrite(unsigned Opcode) {
     return Opcode == AMDGPU::S_BARRIER_LEAVE ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
   }
 
-  static bool readsVCCZ(const MachineInstr &MI) {
+  static bool isCBranchVCCZRead(const MachineInstr &MI) {
     unsigned Opc = MI.getOpcode();
     return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
            !MI.getOperand(1).isUndef();

From ed3d7f64f0e8366f9d97ac396710a29c80629229 Mon Sep 17 00:00:00 2001
From: pvanhout
Date: Mon, 29 Sep 2025 09:56:19 +0200
Subject: [PATCH 3/3] [AMDGPU][SIInsertWaitCnts] De-duplicate code (NFC)

This does the kind of cleanups that would be picked up in code review if
this pass were re-submitted today. The changes are meant to be
straightforward and non-opinionated.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 64 ++++++++++-----------
 1 file changed, 29 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 91136fd85c545..3f9a1f492ace5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1853,26 +1853,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   assert(!MI.isMetaInstruction());
 
   AMDGPU::Waitcnt Wait;
+  const unsigned Opc = MI.getOpcode();
 
   // FIXME: This should have already been handled by the memory legalizer.
   // Removing this currently doesn't affect any lit tests, but we need to
   // verify that nothing was relying on this. The number of buffer invalidates
   // being handled here should not be expanded.
-  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
-      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
-      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
-      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
-      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
+  if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
+      Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
+      Opc == AMDGPU::BUFFER_GL1_INV) {
     Wait.LoadCnt = 0;
   }
 
   // All waits must be resolved at call return.
   // NOTE: this could be improved with knowledge of all call sites or
   // with knowledge of the called routines.
-  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
-      MI.getOpcode() == AMDGPU::SI_RETURN ||
-      MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
-      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+  if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
+      Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
+      Opc == AMDGPU::S_SETPC_B64_return ||
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
   }
@@ -1884,8 +1882,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // send a message to explicitly release all VGPRs before the stores have
   // completed, but it is only safe to do this if there are no outstanding
   // scratch stores.
-  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
-           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+  else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
     if (!WCG->isOptNone() &&
         (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
@@ -1894,8 +1891,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       ReleaseVGPRInsts.insert(&MI);
   }
   // Resolve vm waits before gs-done.
-  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
-            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+  else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
            ST->hasLegacyGeometry() &&
            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
             AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
@@ -1920,7 +1916,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
 
   // Wait for any pending GDS instruction to complete before any
   // "Always GDS" instruction.
-  if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
+  if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
     addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
 
   if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
@@ -1946,7 +1942,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                    Wait);
       }
     }
-  } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
+  } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
     ScoreBrackets.tryClearSCCWriteEvent(&MI);
   } else {
     // FIXME: Should not be relying on memoperands.
@@ -2061,8 +2057,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   //
   // In all other cases, ensure safety by ensuring that there are no outstanding
   // memory operations.
-  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
-      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
+  if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
+      !ST->supportsBackOffBarrier()) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
   }
 
@@ -2146,19 +2142,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
   }
 
   // XCnt may be already consumed by a load wait.
-  if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
-      !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
-    Wait.XCnt = ~0u;
+  if (Wait.XCnt != ~0u) {
+    if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+      Wait.XCnt = ~0u;
 
-  if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
-      !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
-    Wait.XCnt = ~0u;
+    if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+      Wait.XCnt = ~0u;
 
-  // Since the translation for VMEM addresses occur in-order, we can skip the
-  // XCnt if the current instruction is of VMEM type and has a memory dependency
-  // with another VMEM instruction in flight.
-  if (Wait.XCnt != ~0u && isVmemAccess(*It))
-    Wait.XCnt = ~0u;
+    // Since the translation for VMEM addresses occur in-order, we can skip the
+    // XCnt if the current instruction is of VMEM type and has a memory
+    // dependency with another VMEM instruction in flight.
+    if (isVmemAccess(*It))
+      Wait.XCnt = ~0u;
+  }
 
   if (WCG->createNewWaitcnt(Block, It, Wait))
     Modified = true;
@@ -2395,9 +2391,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
       unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
       if (!OldEventsHasSCCWrite) {
         PendingSCCWrite = Other.PendingSCCWrite;
-      } else {
-        if (PendingSCCWrite != Other.PendingSCCWrite)
-          PendingSCCWrite = nullptr;
+      } else if (PendingSCCWrite != Other.PendingSCCWrite) {
+        PendingSCCWrite = nullptr;
       }
     }
   }
@@ -2635,11 +2630,10 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   for (MachineBasicBlock *MBB : ML->blocks()) {
     for (MachineInstr &MI : *MBB) {
       if (isVMEMOrFlatVMEM(MI)) {
-        if (MI.mayLoad())
-          HasVMemLoad = true;
-        if (MI.mayStore())
-          HasVMemStore = true;
+        HasVMemLoad |= MI.mayLoad();
+        HasVMemStore |= MI.mayStore();
       }
+
       for (const MachineOperand &Op : MI.all_uses()) {
         if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
           continue;