[x86] use more phadd for reductions
This is part of what is requested by PR42023:
https://bugs.llvm.org/show_bug.cgi?id=42023

There's an extension needed for FP add, but exactly how we would specify
that using fast-math flags is not clear to me, so I left that as a TODO.
We're still missing patterns for partial reductions when the input vector
is 256-bit or 512-bit, but I think that's a failure of vector narrowing.
If we can reduce the widths, then this matching should work on those tests.
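
As a concrete illustration, this is the kind of shuffle+binop reduction that
now gets matched (a minimal sketch in the style of the hadd32_4 test, not one
of the test files verbatim):

  define i32 @reduce_v4i32(<4 x i32> %x) {
    %s1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %a1 = add <4 x i32> %x, %s1
    %s2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %a2 = add <4 x i32> %a1, %s2
    %r = extractelement <4 x i32> %a2, i32 0
    ret i32 %r
  }

On a fast-hops target (or at optsize), this now compiles to two phaddd
instructions plus a movd instead of the pshufd+paddd chain.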

Differential Revision: https://reviews.llvm.org/D64760

llvm-svn: 366268
rotateright committed Jul 16, 2019
1 parent f4c2d57 commit d746a21
Showing 4 changed files with 86 additions and 56 deletions.
54 changes: 54 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35624,6 +35624,57 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
  llvm_unreachable("All opcodes should return within switch");
}

/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
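  // Horizontal ops typically decode to multiple uops (shuffles + an add) on
  // mainstream CPUs, so forming them is usually a size win rather than a
  // speed win; require fast horizontal ops or optsize.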
  if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
    return SDValue();
  SDValue Index = ExtElt->getOperand(1);
  if (!isNullConstant(Index))
    return SDValue();

  // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
  ISD::NodeType Opc;
  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
  if (!Rdx)
    return SDValue();

  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = ExtElt->getOperand(0).getValueType();
  if (VecVT.getScalarType() != VT)
    return SDValue();

  unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
  SDLoc DL(ExtElt);

  // 256-bit horizontal instructions operate on 128-bit chunks rather than
  // across the whole vector, so we need an extract + hop preliminary stage.
  // This is the only step where the operands of the hop are not the same value.
  // TODO: We could extend this to handle 512-bit or even longer vectors.
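  // For example, a v8i32 reduction first becomes a single cross-half hop:
  //   hadd(Hi, Lo) = [Hi0+Hi1, Hi2+Hi3, Lo0+Lo1, Lo2+Lo3] (v4i32)
  // and the 128-bit hop loop below then finishes the reduction.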
  if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
      ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
    unsigned NumElts = VecVT.getVectorNumElements();
    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
    VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
  }
  if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
      !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
    return SDValue();

  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
  assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
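  // Each hop halves the number of distinct partial sums, so reducing N
  // elements takes log2(N) hops. For example, v4i32 [a,b,c,d]:
  //   hadd -> [a+b, c+d, a+b, c+d], hadd -> [a+b+c+d, ...]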
  unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
  for (unsigned i = 0; i != ReductionSteps; ++i)
    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}

/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -35710,6 +35761,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
    return MinMax;

  if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
    return V;

  if (SDValue V = scalarizeExtEltFP(N, DAG))
    return V;

44 changes: 16 additions & 28 deletions llvm/test/CodeGen/X86/phaddsub-extract.ll
@@ -1903,10 +1903,8 @@ define i16 @hadd16_8(<8 x i16> %x223) {
;
; SSE3-FAST-LABEL: hadd16_8:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE3-FAST-NEXT: paddw %xmm0, %xmm1
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-FAST-NEXT: paddw %xmm1, %xmm0
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1926,10 +1924,8 @@ define i16 @hadd16_8(<8 x i16> %x223) {
;
; AVX-FAST-LABEL: hadd16_8:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1956,10 +1952,9 @@ define i32 @hadd32_4(<4 x i32> %x225) {
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSE3-FAST-NEXT: movd %xmm1, %eax
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_4:
@@ -1973,8 +1968,7 @@ define i32 @hadd32_4(<4 x i32> %x225) {
;
; AVX-FAST-LABEL: hadd32_4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: retq
@@ -2097,21 +2091,17 @@ define i32 @hadd32_16(<16 x i32> %x225) {
define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize {
; SSE3-LABEL: hadd16_8_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE3-NEXT: paddw %xmm0, %xmm1
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: paddw %xmm1, %xmm0
; SSE3-NEXT: phaddw %xmm0, %xmm0
; SSE3-NEXT: phaddw %xmm0, %xmm0
; SSE3-NEXT: phaddw %xmm0, %xmm0
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd16_8_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -2129,16 +2119,14 @@ define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize {
define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize {
; SSE3-LABEL: hadd32_4_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE3-NEXT: paddd %xmm0, %xmm1
; SSE3-NEXT: phaddd %xmm1, %xmm1
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: phaddd %xmm0, %xmm0
; SSE3-NEXT: phaddd %xmm0, %xmm0
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
22 changes: 8 additions & 14 deletions llvm/test/CodeGen/X86/vector-reduce-add-widen.ll
@@ -254,8 +254,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
@@ -307,9 +306,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1-FAST-LABEL: test_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
@@ -635,10 +633,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX1-FAST-LABEL: test_v8i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -704,11 +700,9 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1-FAST-LABEL: test_v16i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
22 changes: 8 additions & 14 deletions llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -241,8 +241,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
@@ -294,9 +293,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1-FAST-LABEL: test_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
@@ -605,10 +603,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX1-FAST-LABEL: test_v8i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -674,11 +670,9 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1-FAST-LABEL: test_v16i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
