[X86][SSE] Don't fold shuffle(binop(),binop()) -> binop(shuffle(),shuffle()) if the shuffles are splats

rGbe69e66b1cd8 added the fold, but DAGCombiner::visitVECTOR_SHUFFLE doesn't merge shuffles if the inner shuffle is a splat, so we need to bail.

The non-fast-horiz-ops paths see some minor regressions; we might be able to improve on this after lowering to target shuffles.

Fix PR48823
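
For reference, the inner shufflevectors in the PR48823 test below count as splat shuffles: every defined mask element reads the same source lane, and undef lanes are ignored, which is exactly the case DAGCombiner::visitVECTOR_SHUFFLE refuses to merge. A minimal standalone sketch of that mask property (my own illustration of the idea, not LLVM's ShuffleVectorSDNode::isSplat implementation):

// Sketch only: a shuffle mask is treated as a splat when all defined elements
// pick the same source element; -1 marks an undef lane and is ignored.
#include <cstdio>
#include <vector>

static bool isSplatMask(const std::vector<int> &Mask) {
  int SplatIdx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;             // undef lane, doesn't constrain the splat
    if (SplatIdx < 0)
      SplatIdx = M;         // first defined lane picks the splat index
    else if (M != SplatIdx)
      return false;         // two different defined lanes -> not a splat
  }
  return SplatIdx >= 0;     // all-undef masks are not counted as splats here
}

int main() {
  // The mask of %3 from the PR48823 test case below, versus an identity mask.
  std::vector<int> SplatLane1 = {1, -1, -1, -1}; // <i32 1, undef, undef, undef>
  std::vector<int> Identity   = {0, 1, 2, 3};
  std::printf("%d %d\n", isSplatMask(SplatLane1), isSplatMask(Identity)); // prints: 1 0
  return 0;
}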
RKSimon committed Jan 22, 2021
1 parent 9ae73cd commit ffe72f9
Showing 2 changed files with 27 additions and 23 deletions.
llvm/lib/Target/X86/X86ISelLowering.cpp: 15 changes (8 additions, 7 deletions)
@@ -37964,23 +37964,24 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     return HAddSub;
 
   // Merge shuffles through binops if its likely we'll be able to merge it
-  // with other shuffles.
+  // with other shuffles (as long as they aren't splats).
   // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
   // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE.
   if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) {
     unsigned SrcOpcode = N->getOperand(0).getOpcode();
     if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) &&
         N->isOnlyUserOf(N->getOperand(0).getNode()) &&
-        N->isOnlyUserOf(N->getOperand(1).getNode()) &&
-        VT.getScalarSizeInBits() >= 32) {
+        N->isOnlyUserOf(N->getOperand(1).getNode())) {
       SDValue Op00 = N->getOperand(0).getOperand(0);
       SDValue Op10 = N->getOperand(1).getOperand(0);
       SDValue Op01 = N->getOperand(0).getOperand(1);
       SDValue Op11 = N->getOperand(1).getOperand(1);
-      if ((Op00.getOpcode() == ISD::VECTOR_SHUFFLE ||
-           Op10.getOpcode() == ISD::VECTOR_SHUFFLE) &&
-          (Op01.getOpcode() == ISD::VECTOR_SHUFFLE ||
-           Op11.getOpcode() == ISD::VECTOR_SHUFFLE)) {
+      auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00);
+      auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10);
+      auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01);
+      auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11);
+      if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) &&
+          ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) {
         SDLoc DL(N);
         ArrayRef<int> Mask = SVN->getMask();
         SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask);
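As an aside on the transform being guarded above: folding the outer shuffle through the two binops is always value-preserving, since an elementwise binop commutes with a shuffle; the new isSplat() checks are purely a profitability guard, because the fold only pays off when the freshly created shuffles later merge with the inner ones. A toy model of the identity, using names that mirror Op00/Op01/Op10/Op11 from the code above (illustrative only, not LLVM code):

// shuffle(bop(Op00,Op01), bop(Op10,Op11), M) == bop(shuffle(Op00,Op10,M),
//                                                   shuffle(Op01,Op11,M))
// modelled on 4 x float with fsub as the binop.
#include <array>
#include <cassert>

using Vec4 = std::array<float, 4>;
using Mask4 = std::array<int, 4>; // lanes 0..3 pick from A, lanes 4..7 pick from B

static Vec4 shuffle(const Vec4 &A, const Vec4 &B, const Mask4 &M) {
  Vec4 R{};
  for (int I = 0; I < 4; ++I)
    R[I] = M[I] < 4 ? A[M[I]] : B[M[I] - 4];
  return R;
}

static Vec4 fsub(const Vec4 &A, const Vec4 &B) {
  return {A[0] - B[0], A[1] - B[1], A[2] - B[2], A[3] - B[3]};
}

int main() {
  Vec4 Op00{1, 2, 3, 4}, Op01{5, 6, 7, 8}, Op10{9, 10, 11, 12}, Op11{13, 14, 15, 16};
  Mask4 M{0, 5, 2, 7}; // an outer mask taking lanes from both binop results
  Vec4 Original = shuffle(fsub(Op00, Op01), fsub(Op10, Op11), M);
  Vec4 Folded = fsub(shuffle(Op00, Op10, M), shuffle(Op01, Op11, M));
  assert(Original == Folded);
  return 0;
}
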
llvm/test/CodeGen/X86/haddsub-3.ll: 35 changes (19 additions, 16 deletions)
@@ -161,46 +161,49 @@ define <4 x float> @PR48823(<4 x float> %0, <4 x float> %1) {
 ; SSE2-LABEL: PR48823:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
 ; SSE2-NEXT: subps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
+; SSE2-NEXT: subps %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-SLOW-LABEL: PR48823:
 ; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm2
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSSE3-SLOW-NEXT: subps %xmm2, %xmm0
+; SSSE3-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-SLOW-NEXT: subps %xmm1, %xmm2
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; SSSE3-SLOW-NEXT: retq
 ;
 ; SSSE3-FAST-LABEL: PR48823:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm2
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
-; SSSE3-FAST-NEXT: subps %xmm2, %xmm0
+; SSSE3-FAST-NEXT: hsubps %xmm1, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: PR48823:
 ; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[2,3]
-; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX1-SLOW-NEXT: vsubps %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; AVX1-SLOW-NEXT: vsubps %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX1-SLOW-NEXT: retq
 ;
 ; AVX1-FAST-LABEL: PR48823:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[2,3]
-; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
-; AVX1-FAST-NEXT: vsubps %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhsubps %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: retq
 ;
 ; AVX2-LABEL: PR48823:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; AVX2-NEXT: vsubps %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT: retq
   %3 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %4 = fsub <4 x float> %0, %3
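The payoff shows up on the fast-horiz-ops runs above, which now collapse to a single hsubps/vhsubps: in AT&T syntax, hsubps %xmm1, %xmm0 writes x0-x1, x2-x3, y0-y1, y2-y3 into xmm0, and PR48823 only cares about lanes 0 and 3 (the remaining lanes are undef in the IR). A small scalar model of those lane semantics (illustrative only):

// dst = hsubps(dst, src):
//   dst[0] = dst[0]-dst[1], dst[1] = dst[2]-dst[3],
//   dst[2] = src[0]-src[1], dst[3] = src[2]-src[3]
#include <array>
#include <cassert>

using Vec4 = std::array<float, 4>;

static Vec4 hsubps(const Vec4 &Dst, const Vec4 &Src) {
  return {Dst[0] - Dst[1], Dst[2] - Dst[3], Src[0] - Src[1], Src[2] - Src[3]};
}

int main() {
  Vec4 X{1, 2, 3, 4}, Y{5, 6, 7, 8}; // X plays %0/%xmm0, Y plays %1/%xmm1
  Vec4 R = hsubps(X, Y);
  // PR48823's result is <%0[0]-%0[1], undef, undef, %1[2]-%1[3]>.
  assert(R[0] == X[0] - X[1]);
  assert(R[3] == Y[2] - Y[3]);
  return 0;
}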
