[X86] Lower vector interleave into unpck and perm

[This Godbolt link](https://godbolt.org/z/s17Kv1s9T) shows different codegen between clang and gcc for a transpose operation. clang result: ``` vmovdqu xmm0, xmmword ptr [rcx + rax] vmovdqu xmm1, xmmword ptr [rcx + rax + 16] vmovdqu xmm2, xmmword ptr [r8 + rax] vmovdqu xmm3, xmmword ptr [r8 + rax + 16] vpunpckhbw xmm4, xmm2, xmm0 vpunpcklbw xmm0, xmm2, xmm0 vpunpcklbw xmm2, xmm3, xmm1 vpunpckhbw xmm1, xmm3, xmm1 vmovdqu xmmword ptr [rdi + 2*rax + 48], xmm1 vmovdqu xmmword ptr [rdi + 2*rax + 32], xmm2 vmovdqu xmmword ptr [rdi + 2*rax], xmm0 vmovdqu xmmword ptr [rdi + 2*rax + 16], xmm4 ``` gcc result: ``` vmovdqu ymm3, YMMWORD PTR [rdi+rax] vpunpcklbw ymm1, ymm3, YMMWORD PTR [rsi+rax] vpunpckhbw ymm0, ymm3, YMMWORD PTR [rsi+rax] vperm2i128 ymm2, ymm1, ymm0, 32 vperm2i128 ymm1, ymm1, ymm0, 49 vmovdqu YMMWORD PTR [rcx+rax*2], ymm2 vmovdqu YMMWORD PTR [rcx+32+rax*2], ymm1 ``` clang's code is roughly 15% slower than gcc's when evaluated on an internal compression benchmark. The loop vectorizer generates the following shufflevector intrinsic: ``` %interleaved.vec = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> ``` which is lowered to SelectionDAG: ``` t2: v32i8,ch = CopyFromReg t0, Register:v32i8 %0 t6: v64i8 = concat_vectors t2, undef:v32i8 t4: v32i8,ch = CopyFromReg t0, Register:v32i8 %1 t7: v64i8 = concat_vectors t4, undef:v32i8 t8: v64i8 = vector_shuffle<0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95> t6, t7 ``` So far this `vector_shuffle` is good enough for us to pattern-match and transform, but as we go down the SelectionDAG pipeline, it got split into smaller shuffles. During dagcombine1, the shuffle is split by `foldShuffleOfConcatUndefs`. ``` // shuffle (concat X, undef), (concat Y, undef), Mask --> // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1) t2: v32i8,ch = CopyFromReg t0, Register:v32i8 %0 t4: v32i8,ch = CopyFromReg t0, Register:v32i8 %1 t19: v32i8 = vector_shuffle<0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47> t2, t4 t15: ch,glue = CopyToReg t0, Register:v32i8 $ymm0, t19 t20: v32i8 = vector_shuffle<16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63> t2, t4 t17: ch,glue = CopyToReg t15, Register:v32i8 $ymm1, t20, t15:1 ``` With `foldShuffleOfConcatUndefs` commented out, the vector is still split later by the type legalizer, which comes after dagcombine1, because v64i8 is not a legal type in AVX2 (64 * 8 = 512 bits while ymm = 256 bits). There doesn't seem to be a good way to avoid this split. Lowering the `vector_shuffle` into unpck and perm during dagcombine1 is too early. Therefore, although somewhat inconvenient, we decided to go with pattern-matching a pair vector shuffles later in the SelectionDAG pipeline, as part of `lowerV32I8Shuffle`. The code looks at the two operands of the first shuffle it encounters, iterates through the users of the operands, and tries to find two shuffles that are consecutive interleaves. Once the pattern is found, it lowers them into unpcks and perms. It returns the perm for the shuffle that's currently being lowered (have ISel modify the DAG), and replaces the other shuffle in place. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D134477
llvm · Oct 17, 2022 · d0d48a9 · d0d48a9
1 parent 4467c78
commit d0d48a9
Show file tree

Hide file tree

Showing 6 changed files with 335 additions and 233 deletions.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17775,6 +17775,90 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
                      DAG.getIntPtrConstant(0, DL));
 }
 
+// a = shuffle v1, v2, mask1    ; interleaving lower lanes of v1 and v2
+// b = shuffle v1, v2, mask2    ; interleaving higher lanes of v1 and v2
+//     =>
+// ul = unpckl v1, v2
+// uh = unpckh v1, v2
+// a = vperm ul, uh
+// b = vperm ul, uh
+//
+// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
+// and permute. We cannot directly match v3 because it is split into two
+// 256-bit vectors in earlier isel stages. Therefore, this function matches a
+// pair of 256-bit shuffles and makes sure the masks are consecutive.
+//
+// Once unpck and permute nodes are created, the permute corresponding to this
+// shuffle is returned, while the other permute replaces the other half of the
+// shuffle in the selection dag.
+static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
+                                                 SDValue V1, SDValue V2,
+                                                 ArrayRef<int> Mask,
+                                                 SelectionDAG &DAG) {
+  if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
+      VT != MVT::v32i8)
+    return SDValue();
+  // <B0, B1, B0+1, B1+1, ..., >
+  auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
+                                   unsigned Begin1) {
+    size_t Size = Mask.size();
+    assert(Size % 2 == 0 && "Expected even mask size");
+    for (unsigned I = 0; I < Size; I += 2) {
+      if (Mask[I] != (int)(Begin0 + I / 2) ||
+          Mask[I + 1] != (int)(Begin1 + I / 2))
+        return false;
+    }
+    return true;
+  };
+  // Check which half is this shuffle node
+  int NumElts = VT.getVectorNumElements();
+  size_t FirstQtr = NumElts / 2;
+  size_t ThirdQtr = NumElts + NumElts / 2;
+  bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
+  bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
+  if (!IsFirstHalf && !IsSecondHalf)
+    return SDValue();
+
+  // Find the intersection between shuffle users of V1 and V2.
+  SmallVector<SDNode *, 2> Shuffles;
+  for (SDNode *User : V1->uses())
+    if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
+        User->getOperand(1) == V2)
+      Shuffles.push_back(User);
+  // Limit user size to two for now.
+  if (Shuffles.size() != 2)
+    return SDValue();
+  // Find out which half of the 512-bit shuffles is each smaller shuffle
+  auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
+  auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
+  SDNode *FirstHalf;
+  SDNode *SecondHalf;
+  if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
+      IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
+    FirstHalf = Shuffles[0];
+    SecondHalf = Shuffles[1];
+  } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
+             IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
+    FirstHalf = Shuffles[1];
+    SecondHalf = Shuffles[0];
+  } else {
+    return SDValue();
+  }
+  // Lower into unpck and perm. Return the perm of this shuffle and replace
+  // the other.
+  SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+  SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+  SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
+                              DAG.getTargetConstant(0x20, DL, MVT::i8));
+  SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
+                              DAG.getTargetConstant(0x31, DL, MVT::i8));
+  if (IsFirstHalf) {
+    DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
+    return Perm1;
+  }
+  DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
+  return Perm2;
+}
 
 /// Handle lowering of 4-lane 64-bit floating point shuffles.
 ///
@@ -18082,6 +18166,16 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                          DAG, Subtarget))
       return V;
 
+  // Try to match an interleave of two v8f32s and lower them as unpck and
+  // permutes using ymms. This needs to go before we try to split the vectors.
+  //
+  // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
+  // this path inadvertently.
+  if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
+    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
+                                                      Mask, DAG))
+      return V;
+
   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
   // since after split we get a more efficient code using vpunpcklwd and
   // vpunpckhwd instrs than vblend.
@@ -18120,6 +18214,13 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                    Zeroable, Subtarget, DAG))
     return ZExt;
 
+  // Try to match an interleave of two v8i32s and lower them as unpck and
+  // permutes using ymms. This needs to go before we try to split the vectors.
+  if (!Subtarget.hasAVX512())
+    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
+                                                      Mask, DAG))
+      return V;
+
   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
   // since after split we get a more efficient code than vblend by using
   // vpunpcklwd and vpunpckhwd instrs.
@@ -18325,6 +18426,13 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
     return V;
 
+  // Try to match an interleave of two v16i16s and lower them as unpck and
+  // permutes using ymms.
+  if (!Subtarget.hasAVX512())
+    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
+                                                      Mask, DAG))
+      return V;
+
   // Otherwise fall back on generic lowering.
   return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
                                     Subtarget, DAG);
@@ -18438,6 +18546,13 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Mask, Zeroable, DAG))
       return V;
 
+  // Try to match an interleave of two v32i8s and lower them as unpck and
+  // permutes using ymms.
+  if (!Subtarget.hasAVX512())
+    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
+                                                      Mask, DAG))
+      return V;
+
   // Otherwise fall back on generic lowering.
   return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
                                     Subtarget, DAG);

diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -492,15 +492,11 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
-; AVX2-SLOW-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm1[0,1]
+; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX2-SLOW-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i16: