82 changes: 82 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4118,6 +4118,88 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
}

namespace {
class BranchWeightCostModel {
const SIInstrInfo &TII;
const TargetSchedModel &SchedModel;
BranchProbability BranchProb;
static constexpr uint64_t BranchNotTakenCost = 1;
uint64_t BranchTakenCost;
uint64_t ThenCyclesCost;

public:
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
const MachineBasicBlock &Succ,
unsigned ExtraTransformationCosts)
: TII(TII), SchedModel(TII.getSchedModel()),
ThenCyclesCost(ExtraTransformationCosts) {
const MachineBasicBlock &Head = *Branch.getParent();
const auto *FromIt = find(Head.successors(), &Succ);
assert(FromIt != Head.succ_end());

BranchProb = Head.getSuccProbability(FromIt);
if (BranchProb.isUnknown())
BranchProb = BranchProbability::getZero();
BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
}

bool isProfitable(const MachineInstr &MI) {
if (TII.isWaitcnt(MI.getOpcode()))
return false;

ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

// Consider `P = N/D` to be the probability of execz being false (skipping
// the then-block) The transformation is profitable if always executing the
// 'then' block is cheaper than executing sometimes 'then' and always
// executing s_cbranch_execz:
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
// BranchNotTakenCost
uint64_t Numerator = BranchProb.getNumerator();
uint64_t Denominator = BranchProb.getDenominator();
return (Denominator - Numerator) * ThenCyclesCost <=
((Denominator - Numerator) * BranchTakenCost +
Numerator * BranchNotTakenCost);
}
};
} // namespace

bool SIInstrInfo::mustRetainExeczBranch(
const MachineInstr &Branch, const MachineBasicBlock &From,
const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const {

assert(is_contained(Branch.getParent()->successors(), &From));
BranchWeightCostModel CostModel{*this, Branch, From,
ExtraTransformationCosts};

const MachineFunction *MF = From.getParent();
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;

for (const MachineInstr &MI : MBB) {
// When a uniform loop is inside non-uniform control flow, the branch
// leaving the loop might never be taken when EXEC = 0.
// Hence we should retain cbranch out of the loop lest it become infinite.
if (MI.isConditionalBranch())
return true;

if (MI.isMetaInstruction())
continue;

if (hasUnwantedEffectsWhenEXECEmpty(MI))
return true;

if (!CostModel.isProfitable(MI))
return true;
}
}

return false;
}

bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();

Expand Down
18 changes: 14 additions & 4 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
TargetSchedModel SchedModel;
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;

public:
// The inverse predicate should have the negative value.
enum BranchPredicate {
INVALID_BR = 0,
Expand All @@ -98,6 +99,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
EXECZ = 3
};

private:
using SetVectorType = SmallSetVector<MachineInstr *, 32>;

static unsigned getBranchOpcode(BranchPredicate Cond);
Expand Down Expand Up @@ -1031,13 +1033,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// Return true if the instruction modifies the mode register.
static bool modifiesModeRegister(const MachineInstr &MI);

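/// Returns true if the execz branch \p Branch must be retained, i.e. it is
/// not legal or not profitable to remove it and unconditionally execute the
/// blocks from \p From up to (but not including) \p To.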
bool mustRetainExeczBranch(const MachineInstr &Branch,
const MachineBasicBlock &From,
const MachineBasicBlock &To,
unsigned ExtraTransformationCosts = 0) const;

/// This function is used to determine if an instruction can be safely
/// executed under EXEC = 0 without hardware error, indeterminate results,
/// and/or visible effects on future vector execution or outside the shader.
/// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is
/// used in removing branches over short EXEC = 0 sequences.
/// As such it embeds certain assumptions which may not apply to every case
/// of EXEC = 0 execution.
/// Note: as of 2024 the only use of this is SIPreEmitPeephole and
/// AMDGPUDemoteSCCBranchToExecz (through SIInstrInfo::mustRetainExeczBranch)
/// where it is used in removing branches over short EXEC = 0 sequences. As
/// such it embeds certain assumptions which may not apply to every case of
/// EXEC = 0 execution.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;

/// Returns true if the instruction could potentially depend on the value of
Expand Down
85 changes: 1 addition & 84 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

Expand All @@ -35,9 +33,6 @@ class SIPreEmitPeephole : public MachineFunctionPass {
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
bool mustRetainExeczBranch(const MachineInstr &Branch,
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
Expand Down Expand Up @@ -299,84 +294,6 @@ bool SIPreEmitPeephole::getBlockDestinations(
return true;
}

namespace {
class BranchWeightCostModel {
const SIInstrInfo &TII;
const TargetSchedModel &SchedModel;
BranchProbability BranchProb;
static constexpr uint64_t BranchNotTakenCost = 1;
uint64_t BranchTakenCost;
uint64_t ThenCyclesCost = 0;

public:
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
const MachineBasicBlock &Succ)
: TII(TII), SchedModel(TII.getSchedModel()) {
const MachineBasicBlock &Head = *Branch.getParent();
const auto *FromIt = find(Head.successors(), &Succ);
assert(FromIt != Head.succ_end());

BranchProb = Head.getSuccProbability(FromIt);
if (BranchProb.isUnknown())
BranchProb = BranchProbability::getZero();
BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
}

bool isProfitable(const MachineInstr &MI) {
if (TII.isWaitcnt(MI.getOpcode()))
return false;

ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

// Consider `P = N/D` to be the probability of execz being false (skipping
// the then-block) The transformation is profitable if always executing the
// 'then' block is cheaper than executing sometimes 'then' and always
// executing s_cbranch_execz:
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
// BranchNotTakenCost
uint64_t Numerator = BranchProb.getNumerator();
uint64_t Denominator = BranchProb.getDenominator();
return (Denominator - Numerator) * ThenCyclesCost <=
((Denominator - Numerator) * BranchTakenCost +
Numerator * BranchNotTakenCost);
}
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
const MachineInstr &Branch, const MachineBasicBlock &From,
const MachineBasicBlock &To) const {
assert(is_contained(Branch.getParent()->successors(), &From));
BranchWeightCostModel CostModel{*TII, Branch, From};

const MachineFunction *MF = From.getParent();
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;

for (const MachineInstr &MI : MBB) {
// When a uniform loop is inside non-uniform control flow, the branch
// leaving the loop might never be taken when EXEC = 0.
// Hence we should retain cbranch out of the loop lest it become infinite.
if (MI.isConditionalBranch())
return true;

if (MI.isMetaInstruction())
continue;

if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
return true;

if (!CostModel.isProfitable(MI))
return true;
}
}

return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
Expand All @@ -396,7 +313,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return false;

// Consider only when it is legal and profitable
if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
if (TII->mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
return false;

LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
Expand Down
59 changes: 40 additions & 19 deletions llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no
; GFX9-LABEL: uniform_br_profitable:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_lt_i32 s21, 1
; GFX9-NEXT: s_cbranch_scc1 .LBB2_2
; GFX9-NEXT: v_cmp_ge_i32_e64 vcc, s21, 1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if.then
; GFX9-NEXT: s_mov_b32 s11, s18
; GFX9-NEXT: s_mov_b32 s10, s17
Expand All @@ -111,26 +111,47 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s19
; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX9-NEXT: .LBB2_2: ; %if.end
; GFX9-NEXT: ; %bb.2: ; %if.end
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: uniform_br_profitable:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_lt_i32 s21, 1
; GFX10-NEXT: s_cbranch_scc1 .LBB2_2
; GFX10-NEXT: ; %bb.1: ; %if.then
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v1, s19
; GFX10-NEXT: s_mov_b32 s11, s18
; GFX10-NEXT: s_mov_b32 s10, s17
; GFX10-NEXT: s_mov_b32 s9, s16
; GFX10-NEXT: s_mov_b32 s8, s7
; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX10-NEXT: .LBB2_2: ; %if.end
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX1010-LABEL: uniform_br_profitable:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT: v_cmp_ge_i32_e64 vcc_lo, s21, 1
; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1010-NEXT: ; %bb.1: ; %if.then
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_mov_b32_e32 v1, s19
; GFX1010-NEXT: s_mov_b32 s11, s18
; GFX1010-NEXT: s_mov_b32 s10, s17
; GFX1010-NEXT: s_mov_b32 s9, s16
; GFX1010-NEXT: s_mov_b32 s8, s7
; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1010-NEXT: ; %bb.2: ; %if.end
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-NEXT: s_mov_b32 exec_lo, s4
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT: s_setpc_b64 s[30:31]
;
; GFX1030-LABEL: uniform_br_profitable:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_mov_b32 s4, exec_lo
; GFX1030-NEXT: v_cmpx_ge_i32_e64 s21, 1
; GFX1030-NEXT: ; %bb.1: ; %if.then
; GFX1030-NEXT: v_mov_b32_e32 v0, s6
; GFX1030-NEXT: v_mov_b32_e32 v1, s19
; GFX1030-NEXT: s_mov_b32 s11, s18
; GFX1030-NEXT: s_mov_b32 s10, s17
; GFX1030-NEXT: s_mov_b32 s9, s16
; GFX1030-NEXT: s_mov_b32 s8, s7
; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1030-NEXT: ; %bb.2: ; %if.end
; GFX1030-NEXT: s_mov_b32 exec_lo, s4
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp = icmp sgt i32 %flag, 0
br i1 %cmp, label %if.then, label %if.end, !prof !1
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@
; GCN-O1-NEXT: Merge disjoint stack slots
; GCN-O1-NEXT: Local Stack Slot Allocation
; GCN-O1-NEXT: Remove dead machine instructions
; GCN-O1-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Machine Block Frequency Analysis
Expand Down Expand Up @@ -617,6 +618,7 @@
; GCN-O1-OPTS-NEXT: Merge disjoint stack slots
; GCN-O1-OPTS-NEXT: Local Stack Slot Allocation
; GCN-O1-OPTS-NEXT: Remove dead machine instructions
; GCN-O1-OPTS-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis
Expand Down Expand Up @@ -932,6 +934,7 @@
; GCN-O2-NEXT: Merge disjoint stack slots
; GCN-O2-NEXT: Local Stack Slot Allocation
; GCN-O2-NEXT: Remove dead machine instructions
; GCN-O2-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O2-NEXT: MachineDominator Tree Construction
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Machine Block Frequency Analysis
Expand Down Expand Up @@ -1260,6 +1263,7 @@
; GCN-O3-NEXT: Merge disjoint stack slots
; GCN-O3-NEXT: Local Stack Slot Allocation
; GCN-O3-NEXT: Remove dead machine instructions
; GCN-O3-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O3-NEXT: MachineDominator Tree Construction
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Machine Block Frequency Analysis
Expand Down