[AMDGPU] Optimize atomic max/min
Summary:
Extend the atomic optimizer to handle signed and unsigned max and min
operations, for both atomicrmw instructions and the buffer atomic
intrinsics, in addition to the existing add and subtract support.

Reviewers: arsenm, sheredom, critson, rampitec

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64328

llvm-svn: 366235
jayfoad committed Jul 16, 2019
1 parent c6fd5ab commit 17060f0
Showing 2 changed files with 249 additions and 36 deletions.
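The key observation behind the new max/min handling is that, unlike add, these operations are idempotent for a uniform operand: a single atomic per wavefront suffices, and each lane can reconstruct its own old value from the broadcast result. Below is a minimal host-side C++ sketch of that equivalence; the 64-lane count, the lane-order reference model, and all names are illustrative assumptions, not part of the patch or of any LLVM API.

// Host-side sketch: one atomic max per wavefront instead of one per lane,
// for a uniform operand. 64 lanes and lane-order execution are assumptions
// for the reference model; illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const int Lanes = 64;
  const int32_t V = 42;       // uniform per-lane operand
  const int32_t InitMem = 7;  // initial value at the atomic location

  // Reference: every lane performs atomicrmw max in lane order and records
  // the memory value it saw just before its own operation.
  int32_t Mem = InitMem;
  int32_t RefOld[Lanes];
  for (int L = 0; L < Lanes; ++L) {
    RefOld[L] = Mem;
    Mem = std::max(Mem, V);
  }

  // Optimized: a single lane issues the atomic, receives the original memory
  // value, and broadcasts it to the wavefront.
  const int32_t OptMem = std::max(InitMem, V);
  const int32_t Broadcast = InitMem;

  // Each lane combines the broadcast with the identity (first active lane)
  // or with the uniform value (every other lane), mirroring the select the
  // pass emits for the uniform max/min case in the diff below.
  const int32_t Identity = INT32_MIN;  // identity for signed max
  for (int L = 0; L < Lanes; ++L) {
    const int32_t LaneOffset = (L == 0) ? Identity : V;
    assert(std::max(Broadcast, LaneOffset) == RefOld[L]);
  }
  assert(OptMem == Mem);
  return 0;
}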
177 changes: 141 additions & 36 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -40,7 +40,7 @@ enum DPP_CTRL {

struct ReplacementInfo {
Instruction *I;
Instruction::BinaryOps Op;
AtomicRMWInst::BinOp Op;
unsigned ValIdx;
bool ValDivergent;
};
@@ -55,8 +55,8 @@ class AMDGPUAtomicOptimizer : public FunctionPass,
bool HasDPP;
bool IsPixelShader;

void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
unsigned ValIdx, bool ValDivergent) const;
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;

public:
static char ID;
@@ -120,16 +120,17 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
break;
}

Instruction::BinaryOps Op;
AtomicRMWInst::BinOp Op = I.getOperation();

switch (I.getOperation()) {
switch (Op) {
default:
return;
case AtomicRMWInst::Add:
Op = Instruction::Add;
break;
case AtomicRMWInst::Sub:
Op = Instruction::Sub;
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
break;
}

@@ -161,20 +162,40 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
Instruction::BinaryOps Op;
AtomicRMWInst::BinOp Op;

switch (I.getIntrinsicID()) {
default:
return;
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
Op = Instruction::Add;
Op = AtomicRMWInst::Add;
break;
case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
Op = Instruction::Sub;
Op = AtomicRMWInst::Sub;
break;
case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
Op = AtomicRMWInst::Min;
break;
case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
Op = AtomicRMWInst::UMin;
break;
case Intrinsic::amdgcn_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
Op = AtomicRMWInst::Max;
break;
case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
Op = AtomicRMWInst::UMax;
break;
}

@@ -206,8 +227,57 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
Value *LHS, Value *RHS) {
CmpInst::Predicate Pred;

switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
return B.CreateBinOp(Instruction::Add, LHS, RHS);
case AtomicRMWInst::Sub:
return B.CreateBinOp(Instruction::Sub, LHS, RHS);

case AtomicRMWInst::Max:
Pred = CmpInst::ICMP_SGT;
break;
case AtomicRMWInst::Min:
Pred = CmpInst::ICMP_SLT;
break;
case AtomicRMWInst::UMax:
Pred = CmpInst::ICMP_UGT;
break;
case AtomicRMWInst::UMin:
Pred = CmpInst::ICMP_ULT;
break;
}
Value *Cond = B.CreateICmp(Pred, LHS, RHS);
return B.CreateSelect(Cond, LHS, RHS);
}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
unsigned BitWidth) {
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
case AtomicRMWInst::UMax:
return APInt::getMinValue(BitWidth);
case AtomicRMWInst::UMin:
return APInt::getMaxValue(BitWidth);
case AtomicRMWInst::Max:
return APInt::getSignedMinValue(BitWidth);
case AtomicRMWInst::Min:
return APInt::getSignedMaxValue(BitWidth);
}
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Instruction::BinaryOps Op,
AtomicRMWInst::BinOp Op,
unsigned ValIdx,
bool ValDivergent) const {
// Start building just before the instruction.
@@ -266,24 +336,24 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,

Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

Value *LaneOffset = nullptr;
Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

Value *ExclScan = nullptr;
Value *NewV = nullptr;

// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
Value *const Identity = B.getIntN(TyBitWidth, 0);

// First we need to set all inactive invocations to 0, so that they can
// correctly contribute to the final result.
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
CallInst *const SetInactive =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

CallInst *const FirstDPP =
B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
{Identity, SetInactive, B.getInt32(DPP_WF_SR1),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
NewV = FirstDPP;
ExclScan = FirstDPP;

const unsigned Iters = 7;
const unsigned DPPCtrl[Iters] = {
@@ -295,21 +365,20 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// This loop performs an exclusive scan across the wavefront, with all lanes
// active (by using the WWM intrinsic).
for (unsigned Idx = 0; Idx < Iters; Idx++) {
Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;
Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
CallInst *const DPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Ty,
{Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});

NewV = B.CreateBinOp(Op, NewV, DPP);
ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
}

LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
NewV = B.CreateBinOp(Op, SetInactive, NewV);
NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);

// Read the value from the last lane, which has accumulated the values of
// each active lane in the wavefront. This will be our new value with which
// we will provide to the atomic operation.
// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
@@ -324,24 +393,41 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
{}, {NewV, B.getInt32(63)});
NewV = ReadLane;
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{NewV, B.getInt32(63)});
} else {
llvm_unreachable("Unhandled atomic bit width");
}

// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
} else {
// Get the total number of active lanes we have by using popcount.
Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

// Calculate the new value we will be contributing to the atomic operation
// for the entire wavefront.
NewV = B.CreateMul(V, CtpopCast);
LaneOffset = B.CreateMul(V, MbcntCast);
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");

case AtomicRMWInst::Add:
case AtomicRMWInst::Sub: {
// Get the total number of active lanes we have by using popcount.
Instruction *const Ctpop =
B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

// Calculate the new value we will be contributing to the atomic operation
// for the entire wavefront.
NewV = B.CreateMul(V, CtpopCast);
break;
}

case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
// Max/min with a uniform value is idempotent: doing the atomic operation
// multiple times has the same effect as doing it once.
NewV = V;
break;
}
}

// We only want a single lane to enter our new control flow, and we do this
@@ -407,7 +493,26 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// get our individual lane's slice into the result. We use the lane offset we
// previously calculated combined with the atomic result value we got from the
// first lane, to get our lane's index into the atomic result.
Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
Value *LaneOffset = nullptr;
if (ValDivergent) {
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
} else {
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
LaneOffset = B.CreateMul(V, MbcntCast);
break;
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
LaneOffset = B.CreateSelect(Cond, Identity, V);
break;
}
}
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

if (IsPixelShader) {
// Need a final PHI to reconverge to above the helper lane branch mask.
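For the divergent-value path above, the pass builds an exclusive scan across the wavefront using set_inactive, update_dpp and readlane. The following host-side sketch has the same shape, with inactive lanes forced to the identity and plain loops standing in for DPP; the lane count, execution mask and per-lane values are illustrative assumptions, and the lane-order reference model is just one valid ordering of the original atomics.

// Host-side model of the divergent max case: inactive lanes are set to the
// identity, an exclusive scan produces each lane's prefix, and the last
// lane's inclusive result is the single value sent to the atomic.
// Illustrative only; not the DPP implementation.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const int Lanes = 64;
  int32_t V[Lanes];
  bool Active[Lanes];
  for (int L = 0; L < Lanes; ++L) {
    Active[L] = (L % 3 != 0);          // arbitrary execution mask
    V[L] = Active[L] ? (L * 7) % 50 : 0;
  }

  const int32_t Identity = INT32_MIN;  // identity for signed max

  // set_inactive: inactive lanes contribute the identity.
  int32_t Vals[Lanes];
  for (int L = 0; L < Lanes; ++L)
    Vals[L] = Active[L] ? V[L] : Identity;

  // Exclusive scan with max: ExclScan[L] = max of Vals[0..L-1].
  int32_t ExclScan[Lanes];
  int32_t Running = Identity;
  for (int L = 0; L < Lanes; ++L) {
    ExclScan[L] = Running;
    Running = std::max(Running, Vals[L]);
  }
  const int32_t WavefrontValue = Running;  // what the single atomic uses

  // Each active lane's "old" result is max(broadcast old value, its
  // exclusive prefix), assuming lanes would have executed in lane order.
  const int32_t InitMem = 3;
  const int32_t Broadcast = InitMem;  // old value returned by the one atomic

  int32_t Mem = InitMem;              // serial reference in lane order
  for (int L = 0; L < Lanes; ++L) {
    if (!Active[L])
      continue;
    const int32_t RefOld = Mem;
    Mem = std::max(Mem, V[L]);
    assert(std::max(Broadcast, ExclScan[L]) == RefOld);
  }
  assert(std::max(Broadcast, WavefrontValue) == Mem);
  return 0;
}

Forcing inactive lanes to the identity rather than to zero is what lets the same scan machinery serve max and min as well as add and subtract.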
108 changes: 108 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -194,3 +194,111 @@ entry:
store i64 %old, i64 addrspace(1)* %out
ret void
}

; GCN-LABEL: max_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: max_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}

; GCN-LABEL: min_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: min_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}

; GCN-LABEL: umax_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: umax_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}

; GCN-LABEL: umin_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: umin_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}
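As a quick sanity check on getIdentityValueForAtomicOp in the first file, the chosen constants really are identities for their operations. A small host-side check (32-bit, illustrative only):

// Sanity check that the identity constants chosen in
// getIdentityValueForAtomicOp are identities for their operations.
// Host-side, 32-bit, illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const int32_t SignedSamples[] = {INT32_MIN, -7, 0, 7, INT32_MAX};
  for (int32_t S : SignedSamples) {
    assert(std::max<int32_t>(S, INT32_MIN) == S);  // Max: signed minimum
    assert(std::min<int32_t>(S, INT32_MAX) == S);  // Min: signed maximum
  }
  const uint32_t UnsignedSamples[] = {0u, 7u, UINT32_MAX};
  for (uint32_t U : UnsignedSamples) {
    assert(std::max<uint32_t>(U, 0u) == U);          // UMax: unsigned minimum
    assert(std::min<uint32_t>(U, UINT32_MAX) == U);  // UMin: unsigned maximum
  }
  // Add and Sub keep using 0, which is unchanged by this patch.
  assert(7 + 0 == 7 && 7 - 0 == 7);
  return 0;
}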
