Skip to content

Commit

Permalink
[X86][SSE] combineShuffleWithHorizOp - canonicalize SHUFFLE(HOP(X,Y),HOP(Y,X)) -> SHUFFLE(HOP(X,Y))
Browse files Browse the repository at this point in the history

Attempt to canonicalize binary shuffles of HOPs with commuted operands to a unary shuffle.
  • Loading branch information
RKSimon committed Aug 11, 2020
1 parent 8dd2eb1 commit 2655bd5
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 11 deletions.
30 changes: 29 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -35886,12 +35886,40 @@ static SDValue combineShuffleWithHorizOp(SDValue N, MVT VT, const SDLoc &DL,
if (!isHoriz && !isPack)
return SDValue();

// Canonicalize unary horizontal ops to only refer to lower halves.
if (TargetMask.size() == VT0.getVectorNumElements()) {
int NumElts = VT0.getVectorNumElements();
int NumLanes = VT0.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
int NumHalfEltsPerLane = NumEltsPerLane / 2;

// Canonicalize binary shuffles of horizontal ops that use the
// same sources to an unary shuffle.
// TODO: Try to perform this fold even if the shuffle remains.
if (BC0 != BC1) {
auto ContainsOps = [](SDValue HOp, SDValue Op) {
return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
};
// Commute if all BC0's ops are contained in BC1.
if (ContainsOps(BC1, BC0.getOperand(0)) &&
ContainsOps(BC1, BC0.getOperand(1))) {
ShuffleVectorSDNode::commuteMask(TargetMask);
std::swap(BC0, BC1);
}
// If BC1 can be represented by BC0, then convert to unary shuffle.
if (ContainsOps(BC0, BC1.getOperand(0)) &&
ContainsOps(BC0, BC1.getOperand(1))) {
for (int &M : TargetMask) {
if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
continue;
int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
M -= NumElts + (SubLane * NumHalfEltsPerLane);
if (BC1.getOperand(SubLane) != BC0.getOperand(0))
M += NumHalfEltsPerLane;
}
}
}

// Canonicalize unary horizontal ops to only refer to lower halves.
for (int i = 0; i != NumElts; ++i) {
int &M = TargetMask[i];
if (isUndefOrZero(M))
Expand Down
8 changes: 0 additions & 8 deletions llvm/test/CodeGen/X86/haddsub-shuf.ll
Expand Up @@ -910,8 +910,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX1_FAST-NEXT: retq
;
Expand All @@ -929,8 +927,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX2_FAST-NEXT: retq
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
Expand Down Expand Up @@ -972,8 +968,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
; AVX1_FAST-LABEL: PR34724_2:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: PR34724_2:
Expand All @@ -987,8 +981,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
; AVX2_FAST-LABEL: PR34724_2:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_FAST-NEXT: retq
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
%t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/X86/haddsub-undef.ll
Expand Up @@ -986,8 +986,6 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX-FAST-NEXT: retq
%3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
Expand Down

0 comments on commit 2655bd5

Please sign in to comment.