Skip to content

Commit cf25346

Browse files
authored
[AMDGPU][GFX1250] Optimize s_wait_xcnt for back-to-back atomic RMWs (#177620)
This patch optimizes the insertion of the s_wait_xcnt instruction for sequences of atomic read-modify-write (RMW) operations in the SIInsertWaitcnts pass. The Memory Legalizer conservatively inserts a soft xcnt instruction before each atomic RMW operation as part of PR 168852, which is correct given the nature of atomic operations. However, for back-to-back atomic RMWs only the first s_wait_xcnt is necessary, so removing the redundant waits improves runtime performance. This patch tracks atomic RMW blocks within each basic block and removes the redundant soft xcnt instructions, keeping only the first wait in each sequence. An atomic RMW block continues through subsequent atomic RMWs and non-memory instructions (e.g., ALU operations), but is broken by CU-scoped memory operations, atomic stores, or basic block boundaries.
1 parent 6451685 commit cf25346

File tree

2 files changed

+1342
-2
lines changed

2 files changed

+1342
-2
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,21 @@ static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
134134
}
135135
}
136136

137+
static bool isSoftXcnt(MachineInstr &MI) {
138+
return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft;
139+
}
140+
141+
static bool isAtomicRMW(MachineInstr &MI) {
142+
return (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && MI.mayLoad() &&
143+
MI.mayStore();
144+
}
145+
146+
// Tracks whether the current instruction stream is inside a run of
// back-to-back atomic RMW operations. Used to drop redundant soft
// s_wait_xcnt instructions: only the first wait of each run is kept.
enum class AtomicRMWState {
  NewBlock,    // Start of a new atomic RMW block
  InsideBlock, // Middle of an existing block
  NotInBlock   // Not in an atomic RMW block
};
151+
137152
/// Integer IDs used to track vector memory locations we may have to wait on.
138153
/// Encoded as u16 chunks:
139154
///
@@ -645,6 +660,8 @@ class SIInsertWaitcnts {
645660
WaitcntBrackets &ScoreBrackets);
646661
void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
647662
bool ExpertMode) const;
663+
AtomicRMWState getAtomicRMWState(MachineInstr &MI,
664+
AtomicRMWState PrevState) const;
648665
};
649666

650667
// This objects maintains the current score brackets of each wait counter, and
@@ -2866,6 +2883,39 @@ void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
28662883
.addImm(EncodedReg);
28672884
}
28682885

2886+
// Track back-to-back atomic RMW instructions, referred to as a block.
2887+
//
2888+
// Determines whether \p MI starts a new atomic RMW block, is inside
2889+
// an existing block, or is outside of a block. A block is broken when a
2890+
// CU-scoped memory op or an atomic store is encountered. ALU ops
2891+
// and non-memory instructions don't break a block. The function returns
2892+
// the new state after processing the current instruction based on
2893+
// \p PrevState, the previously captured state.
2894+
// Track back-to-back atomic RMW instructions, referred to as a block.
//
// Computes the atomic-RMW-block state after processing \p MI, given
// \p PrevState, the state before it. Atomic RMWs advance the state
// machine NotInBlock -> NewBlock -> InsideBlock. LDS traffic, ALU ops and
// other non-memory instructions leave the state untouched; any other
// memory operation that is a pure load or pure store (e.g. a CU-scoped
// access or an atomic store) terminates the block.
AtomicRMWState
SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI,
                                    AtomicRMWState PrevState) const {
  if (isAtomicRMW(MI)) {
    // Advance: NotInBlock -> NewBlock -> InsideBlock; once inside, stay.
    if (PrevState == AtomicRMWState::NotInBlock)
      return AtomicRMWState::NewBlock;
    return PrevState == AtomicRMWState::NewBlock ? AtomicRMWState::InsideBlock
                                                 : PrevState;
  }

  // LDS accesses (DS ops, or FLAT ops that may reach LDS) never break a
  // block.
  bool TouchesLDS =
      TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI));

  // A non-LDS memory op that only loads or only stores ends the block; all
  // remaining instructions (ALU / non-memory / LDS) preserve the state.
  if (!TouchesLDS && MI.mayLoad() != MI.mayStore())
    return AtomicRMWState::NotInBlock;

  return PrevState;
}
2918+
28692919
// Generate s_waitcnt instructions where needed.
28702920
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
28712921
MachineBasicBlock &Block,
@@ -2894,6 +2944,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
28942944

28952945
// Walk over the instructions.
28962946
MachineInstr *OldWaitcntInstr = nullptr;
2947+
AtomicRMWState RMWState = AtomicRMWState::NotInBlock;
28972948

28982949
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
28992950
E = Block.instr_end();
@@ -2903,14 +2954,32 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
29032954
++Iter;
29042955
continue;
29052956
}
2957+
// Get the atomic RMW block state for current instruction.
2958+
RMWState = getAtomicRMWState(Inst, RMWState);
29062959

29072960
// Track pre-existing waitcnts that were added in earlier iterations or by
29082961
// the memory legalizer.
29092962
if (isWaitInstr(Inst) ||
29102963
(IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
2911-
if (!OldWaitcntInstr)
2912-
OldWaitcntInstr = &Inst;
29132964
++Iter;
2965+
bool IsSoftXcnt = isSoftXcnt(Inst);
2966+
// The Memory Legalizer conservatively inserts a soft xcnt before each
2967+
// atomic RMW operation. However, for sequences of back-to-back atomic
2968+
// RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
2969+
// the redundant soft xcnts when we're inside an atomic RMW block.
2970+
if (Iter != E && IsSoftXcnt) {
2971+
// Check if the next instruction can potentially change the atomic RMW
2972+
// state.
2973+
RMWState = getAtomicRMWState(*Iter, RMWState);
2974+
}
2975+
2976+
if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) {
2977+
// Delete this soft xcnt.
2978+
Inst.eraseFromParent();
2979+
Modified = true;
2980+
} else if (!OldWaitcntInstr) {
2981+
OldWaitcntInstr = &Inst;
2982+
}
29142983
continue;
29152984
}
29162985

0 commit comments

Comments
 (0)