From f7ae9c4f0a236a9e984b0aec8f2cfd3aa8699c64 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 23 Sep 2025 14:44:03 +0100
Subject: [PATCH] [X86] lowerShuffleAsDecomposedShuffleMerge - prefer
 permute+unpck patterns vs blend+permute on pre-SSE41 targets

Pre-SSE41 we don't have BLENDI, so blend patterns tend to get expanded to
more complex shuffles.

Fixes 128-bit case from #159670
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 16 +++++++++--
 .../CodeGen/X86/vector-shuffle-128-v16.ll     | 27 ++++---------------
 llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll |  8 +++---
 3 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2feb76e0eb7b4..1c8de3a8df6e2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11721,10 +11721,19 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
   // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
   // pre-shuffle first is a better strategy.
   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
+    // If we don't have blends, see if we can create a cheap unpack.
+    if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
+        (is128BitUnpackShuffleMask(V1Mask, DAG) ||
+         is128BitUnpackShuffleMask(V2Mask, DAG)))
+      if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
+              DL, VT, V1, V2, Mask, Subtarget, DAG))
+        return PermUnpack;
+
     // Only prefer immediate blends to unpack/rotate.
-    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
-                                                          DAG, true))
+    if (SDValue BlendPerm =
+            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
       return BlendPerm;
+
     // If either input vector provides only a single element which is repeated
     // multiple times, unpacking from both input vectors would generate worse
     // code. e.g. for
@@ -11736,13 +11745,16 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
       if (SDValue UnpackPerm =
               lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
         return UnpackPerm;
+
     if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
             DL, VT, V1, V2, Mask, Subtarget, DAG))
       return RotatePerm;
+
     // Unpack/rotate failed - try again with variable blends.
     if (SDValue BlendPerm =
             lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
       return BlendPerm;
+
     if (VT.getScalarSizeInBits() >= 32)
       if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
               DL, VT, V1, V2, Mask, Subtarget, DAG))
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4378ee604459e..89cc7a638fa01 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1051,28 +1051,11 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
 
 ; PR159670
 define <16 x i8> @shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31(<16 x i8> %a, <16 x i8> %b) {
-; SSE2-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
index b8db14c026bf8..3592ed8a84cb2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -362,11 +362,9 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
 define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
 ; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
 ; AMD10H:       # %bb.0:
-; AMD10H-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AMD10H-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AMD10H-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; AMD10H-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; AMD10H-NEXT:    packuswb %xmm0, %xmm0
+; AMD10H-NEXT:    psrld $16, %xmm1
+; AMD10H-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AMD10H-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AMD10H-NEXT:    retq
 ;
 ; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
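
Note: the 128-bit case from #159670 interleaves the low half of %a with the
high half of %b. A minimal standalone reproducer, sketched from the new test
above (the function name @interleave_v16i8 is illustrative, not part of the
patch):

  define <16 x i8> @interleave_v16i8(<16 x i8> %a, <16 x i8> %b) {
    ; Indices 0-15 select from %a, 16-31 from %b; so 24 = %b[8], etc.
    %r = shufflevector <16 x i8> %a, <16 x i8> %b,
         <16 x i32> <i32 0, i32 24, i32 1, i32 25, i32 2, i32 26, i32 3, i32 27,
                     i32 4, i32 28, i32 5, i32 29, i32 6, i32 30, i32 7, i32 31>
    ret <16 x i8> %r
  }

Run through something like llc -mtriple=x86_64-unknown-unknown -mattr=+sse2,
this should now lower to the two-instruction pshufd+punpcklbw sequence in the
merged SSE check lines, rather than the seven-instruction zero-extend/pack
expansion that plain SSE2 targets previously produced.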