-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[X86] lowerShuffleAsDecomposedShuffleMerge - prefer permute+unpck patterns vs blend+permute on pre-SSE41 targets #160301
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
…terns vs blend+permute on pre-SSE41 targets Pre-SSE41 we don't have BLENDI so blend patterns tend to get expanded to more complex shuffles Fixes 128-bit case from llvm#159670
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesPre-SSE41 we don't have BLENDI so blend patterns tend to get expanded to more complex shuffles Fixes 128-bit case from #159670 Full diff: https://github.com/llvm/llvm-project/pull/160301.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2feb76e0eb7b4..1c8de3a8df6e2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11721,10 +11721,19 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
// we'll have to do 2x as many shuffles in order to achieve this, a 2-input
// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
+ // If we don't have blends, see if we can create a cheap unpack.
+ if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
+ (is128BitUnpackShuffleMask(V1Mask, DAG) ||
+ is128BitUnpackShuffleMask(V2Mask, DAG)))
+ if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return PermUnpack;
+
// Only prefer immediate blends to unpack/rotate.
- if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
- DAG, true))
+ if (SDValue BlendPerm =
+ lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
return BlendPerm;
+
// If either input vector provides only a single element which is repeated
// multiple times, unpacking from both input vectors would generate worse
// code. e.g. for
@@ -11736,13 +11745,16 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
if (SDValue UnpackPerm =
lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
return UnpackPerm;
+
if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
+
// Unpack/rotate failed - try again with variable blends.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG))
return BlendPerm;
+
if (VT.getScalarSizeInBits() >= 32)
if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
DL, VT, V1, V2, Mask, Subtarget, DAG))
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4378ee604459e..89cc7a638fa01 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1051,28 +1051,11 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
; PR159670
define <16 x i8> @shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31(<16 x i8> %a, <16 x i8> %b) {
-; SSE2-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
; AVX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
index b8db14c026bf8..3592ed8a84cb2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -362,11 +362,9 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; AMD10H: # %bb.0:
-; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; AMD10H-NEXT: packuswb %xmm0, %xmm0
+; AMD10H-NEXT: psrld $16, %xmm1
+; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AMD10H-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/23039 Here is the relevant piece of the build log for the reference
|
Thank you @RKSimon ! |
Pre-SSE41 we don't have BLENDI so blend patterns tend to get expanded to more complex shuffles
Fixes 128-bit case from #159670