Skip to content

Commit

Permalink
AMDGPU/SDAG: Refine the fold to v_mad_[iu]64_[iu]32
Browse files Browse the repository at this point in the history
Only fold for uniform values on pre-GFX9 chips. GFX9+ allow us
to keep the calculation entirely on the SALU.

For subtargets where integer multiplication isn't full-rate, avoid
folding if the multiply has too many uses.

Finally, we expand 64x32 and 64x64 multiplies here as well, if they
feed into an addition. This results in better code generation than
the generic expansion for such multiplies because we end up using
the accumulator of the MAD instructions.

Differential Revision: https://reviews.llvm.org/D123835
  • Loading branch information
nhaehnle committed May 10, 2022
1 parent 009f6ce commit 6c2a01c
Show file tree
Hide file tree
Showing 4 changed files with 224 additions and 125 deletions.
109 changes: 95 additions & 14 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10691,7 +10691,14 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}

// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z).
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::ADD);
Expand All @@ -10705,6 +10712,11 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
if (VT.isVector())
return SDValue();

// S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
// result in scalar registers for uniform values.
if (!N->isDivergent() && Subtarget->hasSMulHi())
return SDValue();

unsigned NumBits = VT.getScalarSizeInBits();
if (NumBits <= 32 || NumBits > 64)
return SDValue();
Expand All @@ -10714,27 +10726,96 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
std::swap(LHS, RHS);
}

// Avoid the fold if it would unduly increase the number of multiplies due to
// multiple uses, except on hardware with full-rate multiply-add (which is
// part of full-rate 64-bit ops).
if (!Subtarget->hasFullRate64Ops()) {
unsigned NumUsers = 0;
for (SDNode *Use : LHS->uses()) {
// There is a use that does not feed into addition, so the multiply can't
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
if (Use->getOpcode() != ISD::ADD)
return SDValue();

// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
// MUL + 3xADD + 3xADDC over 3xMAD.
++NumUsers;
if (NumUsers >= 3)
return SDValue();
}
}

SDValue MulLHS = LHS.getOperand(0);
SDValue MulRHS = LHS.getOperand(1);
SDValue AddRHS = RHS;

// TODO: Maybe restrict if SGPR inputs.
if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
numBitsUnsigned(MulRHS, DAG) <= 32) {
MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
// Always check whether operands are small unsigned values, since that
// knowledge is useful in more cases. Check for small signed values only if
// doing so can unlock a shorter code sequence.
bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;

bool MulSignedLo = false;
if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
numBitsSigned(MulRHS, DAG) <= 32;
}

if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
// The operands and final result all have the same number of bits. If
// operands need to be extended, they can be extended with garbage. The
// resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
// truncated away in the end.
if (VT != MVT::i64) {
MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
}

return SDValue();
// The basic code generated is conceptually straightforward. Pseudo code:
//
// accum = mad_64_32 lhs.lo, rhs.lo, accum
// accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
// accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
//
// The second and third lines are optional, depending on whether the factors
// are {sign,zero}-extended or not.
//
// The actual DAG is noisier than the pseudo code, but only due to
// instructions that disassemble values into low and high parts, and
// assemble the final result.
SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
SDValue One = DAG.getConstant(1, SL, MVT::i32);

auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
SDValue Accum =
getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero);
auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One);

if (!MulLHSUnsigned32) {
auto MulLHSHi =
DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
}

if (!MulRHSUnsigned32) {
auto MulRHSHi =
DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
}

Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
Accum = DAG.getBitcast(MVT::i64, Accum);
}

if (VT != MVT::i64)
Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
return Accum;
}

SDValue SITargetLowering::performAddCombine(SDNode *N,
Expand Down
48 changes: 21 additions & 27 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -830,17 +830,16 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
Expand Down Expand Up @@ -874,17 +873,16 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -922,16 +920,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v2
; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1]
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
Expand Down Expand Up @@ -968,16 +964,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v2
; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1]
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
Expand Down
62 changes: 28 additions & 34 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -968,18 +968,17 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1009,18 +1008,17 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1052,16 +1050,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1092,16 +1088,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v2
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
Expand Down
Loading

0 comments on commit 6c2a01c

Please sign in to comment.