From 2655bd51d6a350b1aa71566fa9cbaad64990336a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 11 Aug 2020 18:06:00 +0100
Subject: [PATCH] [X86][SSE] combineShuffleWithHorizOp - canonicalize
 SHUFFLE(HOP(X,Y),HOP(Y,X)) -> SHUFFLE(HOP(X,Y))

Attempt to canonicalize binary shuffles of HOPs with commuted operands to an unary shuffle.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 30 ++++++++++++++++++++++++-
 llvm/test/CodeGen/X86/haddsub-shuf.ll   |  8 -------
 llvm/test/CodeGen/X86/haddsub-undef.ll  |  2 --
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 210d04be7c36e3..9ca4d3386960f4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35886,12 +35886,40 @@ static SDValue combineShuffleWithHorizOp(SDValue N, MVT VT, const SDLoc &DL,
   if (!isHoriz && !isPack)
     return SDValue();
 
-  // Canonicalize unary horizontal ops to only refer to lower halves.
   if (TargetMask.size() == VT0.getVectorNumElements()) {
     int NumElts = VT0.getVectorNumElements();
     int NumLanes = VT0.getSizeInBits() / 128;
     int NumEltsPerLane = NumElts / NumLanes;
     int NumHalfEltsPerLane = NumEltsPerLane / 2;
+
+    // Canonicalize binary shuffles of horizontal ops that use the
+    // same sources to an unary shuffle.
+    // TODO: Try to perform this fold even if the shuffle remains.
+    if (BC0 != BC1) {
+      auto ContainsOps = [](SDValue HOp, SDValue Op) {
+        return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
+      };
+      // Commute if all BC0's ops are contained in BC1.
+      if (ContainsOps(BC1, BC0.getOperand(0)) &&
+          ContainsOps(BC1, BC0.getOperand(1))) {
+        ShuffleVectorSDNode::commuteMask(TargetMask);
+        std::swap(BC0, BC1);
+      }
+      // If BC1 can be represented by BC0, then convert to unary shuffle.
+      if (ContainsOps(BC0, BC1.getOperand(0)) &&
+          ContainsOps(BC0, BC1.getOperand(1))) {
+        for (int &M : TargetMask) {
+          if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
+            continue;
+          int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
+          M -= NumElts + (SubLane * NumHalfEltsPerLane);
+          if (BC1.getOperand(SubLane) != BC0.getOperand(0))
+            M += NumHalfEltsPerLane;
+        }
+      }
+    }
+
+    // Canonicalize unary horizontal ops to only refer to lower halves.
     for (int i = 0; i != NumElts; ++i) {
       int &M = TargetMask[i];
       if (isUndefOrZero(M))
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index d7bacfe04be88a..4a2f8bd0c8b27a 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -910,8 +910,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
 ; AVX1_FAST:       # %bb.0:
 ; AVX1_FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX1_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX1_FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 ; AVX1_FAST-NEXT:    retq
 ;
@@ -929,8 +927,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX2_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2_FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 ; AVX2_FAST-NEXT:    retq
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32>
@@ -972,8 +968,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
 ; AVX1_FAST-LABEL: PR34724_2:
 ; AVX1_FAST:       # %bb.0:
 ; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX1_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX1_FAST-NEXT:    retq
 ;
 ; AVX2_SLOW-LABEL: PR34724_2:
@@ -987,8 +981,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
 ; AVX2_FAST-LABEL: PR34724_2:
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX2_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2_FAST-NEXT:    retq
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
   %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index f950d0b6a723dd..f8379e629c0245 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -986,8 +986,6 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 ; AVX-FAST-NEXT:    retq
   %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32>
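
Illustrative sketch (not part of the patch): the new canonicalization targets shuffles whose two inputs are horizontal ops built from the same pair of sources in swapped order, i.e. SHUFFLE(HOP(X,Y),HOP(Y,X)). A minimal IR reproducer in the spirit of the PR34724 tests above might look like the following; the function name, shuffle mask and llc invocation are hypothetical, and it assumes the blend of the two FHADD nodes reaches combineShuffleWithHorizOp as a target shuffle.

; Hypothetical example; compile with e.g. llc -mtriple=x86_64-- -mattr=+sse3.
; %h1 = hadd(b,a) reads the same sources as %h0 = hadd(a,b), so after the
; commute/remap step above the blend can be expressed as a shuffle of %h0
; alone, and the second vhaddps should become dead (mirroring the removed
; vhaddps/vblendps check lines in the updated tests).
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)

define <4 x float> @hadd_commuted_ops(<4 x float> %a, <4 x float> %b) {
  %h0 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a, <4 x float> %b)
  %h1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %b, <4 x float> %a)
  %r = shufflevector <4 x float> %h0, <4 x float> %h1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %r
}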