diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3631016b0f5c7..0699ebf6f6f88 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41567,6 +41567,17 @@ static SDValue combineX86ShufflesRecursively(
     resolveTargetShuffleInputsAndMask(Ops, Mask);
   }
 
+  // Handle the all undef/zero/ones cases.
+  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
+    return DAG.getUNDEF(RootVT);
+  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
+    return getZeroVector(RootVT, Subtarget, DAG, DL);
+  if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
+      !llvm::is_contained(Mask, SM_SentinelZero))
+    return getOnesVector(RootVT, DAG, DL);
+
+  assert(!Ops.empty() && "Shuffle with no inputs detected");
+
   // We can only combine unary and binary shuffle mask cases.
   if (Ops.size() <= 2) {
     // Minor canonicalization of the accumulated shuffle mask to make it easier
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index f7764b1593b51..298858a8fcc73 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -1092,3 +1092,116 @@ define void @packss_zext_v8i1() {
   store <16 x i16> %tmp11, ptr undef, align 2
   ret void
 }
+
+define <32 x i16> @PR158415(<8 x i8> %arg) {
+; X86-AVX2-LABEL: PR158415:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4]
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; X86-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X86-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24],zero,ymm0[25],zero,ymm0[30],zero,ymm0[31],zero,ymm0[u,u,u,u,u,u,u,u]
+; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; X86-AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[12,13,14,15],zero,zero,ymm1[4,5,u,u,u,u,u,u,u,u,28,29,30,31],zero,zero,ymm1[20,21],zero,zero,ymm1[26,27,28,29,30,31]
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,2]
+; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpbroadcastw %xmm1, %ymm3
+; X86-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6,7,8,9],ymm3[10],ymm0[11,12,13,14,15]
+; X86-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,xmm2[u,u],zero,zero
+; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
+; X86-AVX2-NEXT: retl
+;
+; X86-AVX512-LABEL: PR158415:
+; X86-AVX512: # %bb.0: # %entry
+; X86-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4]
+; X86-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; X86-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; X86-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X86-AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X86-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; X86-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X86-AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X86-AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
+; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
+; X86-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X86-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; X86-AVX512-NEXT: vpsrld $16, %xmm2, %xmm2
+; X86-AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; X86-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
+; X86-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X86-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; X86-AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX2-LABEL: PR158415:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4]
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24],zero,ymm0[25],zero,ymm0[30],zero,ymm0[31],zero,ymm0[u,u,u,u,u,u,u,u]
+; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[12,13,14,15],zero,zero,ymm1[4,5,u,u,u,u,u,u,u,u,28,29,30,31],zero,zero,ymm1[20,21],zero,zero,ymm1[26,27,28,29,30,31]
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,2]
+; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpbroadcastw %xmm1, %ymm3
+; X64-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6,7,8,9],ymm3[10],ymm0[11,12,13,14,15]
+; X64-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; X64-AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,xmm2[u,u],zero,zero
+; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: PR158415:
+; X64-AVX512: # %bb.0: # %entry
+; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4]
+; X64-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; X64-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X64-AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
+; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; X64-AVX512-NEXT: vpsrld $16, %xmm2, %xmm2
+; X64-AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; X64-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
+; X64-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+entry:
+  %shuffle2 = shufflevector <8 x i8> %arg, <8 x i8> zeroinitializer, <32 x i32>
+  %conv3 = zext <32 x i8> %shuffle2 to <32 x i16>
+  %shuffle4 = shufflevector <32 x i16> zeroinitializer, <32 x i16> %conv3, <32 x i32>
+  %not = xor <32 x i16> %shuffle4, splat (i16 1)
+  %shuffle5 = shufflevector <32 x i16> zeroinitializer, <32 x i16> %not, <32 x i32>
+  ret <32 x i16> %shuffle5
+}
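Illustrative sketch (not part of the patch above): the new early-outs classify a fully-resolved shuffle mask before any unary/binary combining runs. The standalone C++ below mirrors that ordering on a plain integer mask, assuming the sentinel conventions used by X86ISelLowering (SM_SentinelUndef = -1, SM_SentinelZero = -2); the SingleAllOnesInput flag stands in for the Ops.size() == 1 && ISD::isBuildVectorAllOnes(...) check, and the helper name classify is purely hypothetical.

// Standalone illustration of the early-out ordering; uses plain ints rather
// than SelectionDAG nodes. Sentinel values assumed to match X86ISelLowering.
#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

constexpr int SentinelUndef = -1; // lane is undef
constexpr int SentinelZero = -2;  // lane is forced to zero

// Mirrors the order of the new checks:
//   1. every lane undef                 -> fold to UNDEF
//   2. every lane undef or zero         -> fold to a zero vector
//   3. one all-ones input, no zero lane -> fold to an all-ones vector
//   otherwise fall through to the normal unary/binary combining.
std::string classify(const std::vector<int> &Mask, bool SingleAllOnesInput) {
  if (std::all_of(Mask.begin(), Mask.end(),
                  [](int Idx) { return Idx == SentinelUndef; }))
    return "undef";
  if (std::all_of(Mask.begin(), Mask.end(), [](int Idx) { return Idx < 0; }))
    return "zero";
  if (SingleAllOnesInput &&
      std::find(Mask.begin(), Mask.end(), SentinelZero) == Mask.end())
    return "ones";
  return "combine";
}

int main() {
  // Every lane undef or zeroed: folds straight to a zero vector.
  assert(classify({SentinelUndef, SentinelZero, SentinelZero, SentinelUndef},
                  /*SingleAllOnesInput=*/false) == "zero");
  // Every lane undef: folds to UNDEF.
  assert(classify({SentinelUndef, SentinelUndef}, false) == "undef");
  // Single all-ones input with only real/undef lanes: folds to all-ones.
  assert(classify({0, 1, SentinelUndef, 3}, true) == "ones");
  std::cout << "classification matches the expected folds\n";
  return 0;
}

A mask whose lanes are all undef or zeroed can leave Ops with no real inputs, which appears to be the degenerate case the PR158415 test guards against; the new assert(!Ops.empty()) only runs after these folds have been taken.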