Skip to content

Commit

Permalink
[AMDGPU] Add pseudo wavemode to optimize strict_wqm
Browse files Browse the repository at this point in the history
Strict WQM does not require a WQM transition if it occurs within
an existing WQM section.
This occurs heavily in GFX11 pixel shaders with LDS_PARAM_LOAD,
which leads to unnecessary EXEC mask manipulation.

To avoid these transitions, detect WQM -> Strict WQM -> WQM
and substitute new ENTER_PSEUDO_WM/EXIT_PSEUDO_WM markers instead.
These are treated similarly by the WWM register pre-allocation pass,
but do not manipulate EXEC or use registers to save EXEC state.

Reviewed By: piotr

Differential Revision: https://reviews.llvm.org/D136813
  • Loading branch information
perlfu committed Oct 28, 2022
1 parent 7aa0968 commit a3646ec
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 23 deletions.
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Expand Up @@ -2142,6 +2142,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
case AMDGPU::ENTER_PSEUDO_WM:
case AMDGPU::EXIT_PSEUDO_WM: {
// These do nothing.
MI.eraseFromParent();
break;
}
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -188,6 +188,21 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let mayStore = 0;
}

// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
// ENTER_PSEUDO_WM marks the start of such a region. It has no explicit
// operands; EXEC is listed as both an implicit use and an implicit def.
// SIInstrInfo::expandPostRAPseudo erases this instruction.
// NOTE(review): the EXEC use/def presumably keeps EXEC-dependent
// instructions from being moved across the marker -- confirm.
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
let Uses = [EXEC];
let Defs = [EXEC];
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
}

// EXIT_PSEUDO_WM marks the end of a region started by ENTER_PSEUDO_WM.
// It has no explicit operands and, unlike ENTER_PSEUDO_WM, lists no
// implicit EXEC use or def.
// SIInstrInfo::expandPostRAPseudo erases this instruction.
def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
}

// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
Expand Down
14 changes: 10 additions & 4 deletions llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
Expand Up @@ -163,15 +163,19 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {

unsigned Opc = MI.getOpcode();

if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
Opc == AMDGPU::ENTER_PSEUDO_WM) {
dbgs() << "Entering ";
} else {
assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
Opc == AMDGPU::EXIT_PSEUDO_WM);
dbgs() << "Exiting ";
}

if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
dbgs() << "Strict WWM ";
} else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
dbgs() << "Pseudo WWM/WQM ";
} else {
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Strict WQM ";
Expand Down Expand Up @@ -214,14 +218,16 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
RegsAssigned |= processDef(MI.getOperand(0));

if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}

if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
Expand Down
51 changes: 50 additions & 1 deletion llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Expand Up @@ -215,6 +215,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
bool IsWQM);
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
MachineInstr *Exit);

void lowerBlock(MachineBasicBlock &MBB);
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
Expand Down Expand Up @@ -1040,6 +1042,31 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
return NewTerm;
}

// Rewrite an ENTER_STRICT_WQM / EXIT_STRICT_WQM pair as the operand-less
// ENTER_PSEUDO_WM / EXIT_PSEUDO_WM markers.
// The markers still delimit the region for register pre-allocation so
// nothing inside it gets clobbered, but they involve no EXEC mask changes.
//
// \param MBB   Block that contains both instructions.
// \param Entry The ENTER_STRICT_WQM instruction to replace (erased here).
// \param Exit  The EXIT_STRICT_WQM instruction to replace (erased here).
void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
                                            MachineInstr *Entry,
                                            MachineInstr *Exit) {
  assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
  assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);

  // Must be read up front: Entry is erased further down.
  const Register EntryDstReg = Entry->getOperand(0).getReg();

  // Insert a marker with opcode MarkerOpc directly before Old, hand Old's
  // slot in the LiveIntervals maps over to the marker, then delete Old.
  auto SwapForMarker = [&](MachineInstr *Old, unsigned MarkerOpc) {
    MachineInstr *Marker = BuildMI(MBB, Old, DebugLoc(), TII->get(MarkerOpc));
    LIS->ReplaceMachineInstrInMaps(*Old, *Marker);
    Old->eraseFromParent();
  };

  SwapForMarker(Exit, AMDGPU::EXIT_PSEUDO_WM);
  SwapForMarker(Entry, AMDGPU::ENTER_PSEUDO_WM);

  // Operand 0 of the erased entry instruction no longer appears on either
  // marker, so drop its live interval.
  LIS->removeInterval(EntryDstReg);
}

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
Expand All @@ -1056,9 +1083,12 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {

SmallVector<MachineInstr *, 4> SplitPoints;
char State = BI.InitialState;
MachineInstr *StrictEntry = nullptr;

for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
char PreviousState = State;

if (StateTransition.count(&MI))
State = StateTransition[&MI];

Expand All @@ -1071,6 +1101,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
case AMDGPU::ENTER_STRICT_WQM:
StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
break;
case AMDGPU::EXIT_STRICT_WQM:
if (State == StateWQM && StrictEntry) {
// Transition WQM -> StrictWQM -> WQM detected.
lowerPseudoStrictMode(MBB, StrictEntry, &MI);
}
StrictEntry = nullptr;
break;
case AMDGPU::ENTER_STRICT_WWM:
case AMDGPU::EXIT_STRICT_WWM:
StrictEntry = nullptr;
break;
default:
break;
}
Expand Down Expand Up @@ -1213,7 +1257,12 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
.addImm(-1);
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StateStrictWWM;
StateTransition[MI] = StrictStateNeeded;

// Mark block as needing lower so it will be checked for unnecessary transitions.
auto BII = Blocks.find(&MBB);
if (BII != Blocks.end())
BII->second.NeedsLowering = true;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
Expand Down
20 changes: 2 additions & 18 deletions llvm/test/CodeGen/AMDGPU/wqm.ll
Expand Up @@ -2820,24 +2820,18 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
Expand All @@ -2850,24 +2844,18 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
Expand Down Expand Up @@ -3150,10 +3138,8 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
Expand Down Expand Up @@ -3194,11 +3180,9 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
Expand Down

0 comments on commit a3646ec

Please sign in to comment.