From 3d4bb78fbe620e5ec930fb7070e7e0d3daa47a2c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 9 Apr 2022 12:52:56 +0100 Subject: [PATCH] [X86][SSE] combineSelect - more aggressively create zero elements in the or(pshufb(x), pshufb(y)) fold When we fold vselect(cond, pshufb(x), pshufb(y)) -> or(pshufb(x), pshufb(y)), ensure we convert all undef elements to zero elements - this should help us expose more known zero elements for deeper chains of these cases. Noticed while triaging Issue #54819 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +-- .../vector-interleaved-load-i8-stride-6.ll | 36 +++++++++---------- .../vector-interleaved-store-i16-stride-3.ll | 2 +- .../vector-interleaved-store-i8-stride-3.ll | 4 +-- .../vector-interleaved-store-i8-stride-6.ll | 16 ++++----- .../CodeGen/X86/vector-shuffle-256-v32.ll | 14 ++++---- 6 files changed, 38 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b3a170f60a312..533882c27cfd8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44002,11 +44002,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // getConstVector sets negative shuffle mask values as undef, so ensure // we hardcode SM_SentinelZero values to zero (0x80). if (CondMask[i] < NumElts) { - LHSMask[i] = (LHSMask[i] == SM_SentinelZero) ? 0x80 : LHSMask[i]; + LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i]; RHSMask[i] = 0x80; } else { LHSMask[i] = 0x80; - RHSMask[i] = (RHSMask[i] == SM_SentinelZero) ? 0x80 : RHSMask[i]; + RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i]; } } LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0), diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 2d3325b6f1328..54c12589b9439 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -2305,7 +2305,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[0,6,12,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14,u,u,u,u,u],zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30,u,u,u,u,u],zero,zero,zero,ymm3[20,26] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[20,26] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm4[0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14],zero,zero,ymm4[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,24,30],zero,zero ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0 @@ -2315,7 +2315,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15,u,u,u,u,u],zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31,u,u,u,u,u],zero,zero,zero,ymm3[21,27] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm0 @@ -2326,7 +2326,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10,u,u,u,u,u,u],zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26,u,u,u,u,u,u],zero,zero,ymm3[16,22,28] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero ; AVX2-SLOW-NEXT: vpor %ymm7, %ymm10, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = @@ -2335,7 +2335,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11,u,u,u,u,u,u],zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27,u,u,u,u,u,u],zero,zero,ymm3[17,23,29] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero @@ -2363,7 +2363,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,10],zero,zero,zero,xmm5[2,8,14],zero,zero,xmm5[u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,ymm4[u,u,u,u,u,0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,ymm4[u,u,u,u,u,16,22,28],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30] ; AVX2-SLOW-NEXT: vpor %ymm7, %ymm13, %ymm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7] @@ -2376,7 +2376,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,11],zero,zero,zero,xmm5[3,9,15],zero,zero,xmm5[u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,ymm4[u,u,u,u,u,1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,ymm4[u,u,u,u,u,17,23,29],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,9,15],zero,zero,ymm3[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,25,31] ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] @@ -2423,7 +2423,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[0,6,12,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14,u,u,u,u,u],zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30,u,u,u,u,u],zero,zero,zero,ymm3[20,26] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[20,26] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm4[0,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14],zero,zero,ymm4[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,24,30],zero,zero ; AVX2-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 @@ -2433,7 +2433,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15,u,u,u,u,u],zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31,u,u,u,u,u],zero,zero,zero,ymm3[21,27] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero ; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm0 @@ -2444,7 +2444,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10,u,u,u,u,u,u],zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26,u,u,u,u,u,u],zero,zero,ymm3[16,22,28] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = @@ -2453,7 +2453,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11,u,u,u,u,u,u],zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27,u,u,u,u,u,u],zero,zero,ymm3[17,23,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero @@ -2481,7 +2481,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,10],zero,zero,zero,xmm5[2,8,14],zero,zero,xmm5[u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,ymm4[u,u,u,u,u,0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,ymm4[u,u,u,u,u,16,22,28],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm13, %ymm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7] @@ -2494,7 +2494,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,11],zero,zero,zero,xmm5[3,9,15],zero,zero,xmm5[u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,ymm4[u,u,u,u,u,1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,ymm4[u,u,u,u,u,17,23,29],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,9,15],zero,zero,ymm3[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,25,31] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] @@ -2534,7 +2534,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[0,6,12],zero,zero,zero,ymm11[u,u,u,u,u,2,8,14],zero,zero,ymm11[16,22,28],zero,zero,zero,ymm11[u,u,u,u,u,18,24,30],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14],zero,zero,ymm11[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,24,30],zero,zero ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm2[2,3],mem[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[4,10],zero,zero,zero,ymm12[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,26] ; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4 @@ -2557,7 +2557,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[1,7,13],zero,zero,zero,ymm11[u,u,u,u,u,3,9,15],zero,zero,ymm11[17,23,29],zero,zero,zero,ymm11[u,u,u,u,u,19,25,31],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15],zero,zero,ymm11[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[19,25,31],zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm12[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[5,11],zero,zero,zero,ymm12[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[21,27] ; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] @@ -2570,7 +2570,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,8,14],zero,zero,ymm11[u,u,u,u,u,u,4,10],zero,zero,zero,ymm11[18,24,30],zero,zero,ymm11[u,u,u,u,u,u,20,26],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,10],zero,zero,zero,ymm11[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,26],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm12[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,6,12],zero,zero,zero,ymm12[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,22,28] ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 @@ -2592,7 +2592,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX512-NEXT: movl $-2097152, %eax # imm = 0xFFE00000 ; AVX512-NEXT: kmovd %eax, %k2 ; AVX512-NEXT: vmovdqu8 %ymm3, %ymm8 {%k2} -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[3,9,15],zero,zero,ymm11[u,u,u,u,u,u,5,11],zero,zero,zero,ymm11[19,25,31],zero,zero,ymm11[u,u,u,u,u,u,21,27],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[5,11],zero,zero,zero,ymm11[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[21,27],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[1,7,13],zero,zero,zero,ymm12[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[17,23,29] ; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15,u,u,u,u,u] @@ -2609,7 +2609,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[4,10],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm12[0,6,12,u,u,u,u,u],zero,zero,zero,ymm12[2,8,14],zero,zero,ymm12[16,22,28,u,u,u,u,u],zero,zero,zero,ymm12[18,24,30] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm12[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,8,14],zero,zero,ymm12[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[18,24,30] ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[0,6,12],zero,zero,zero,ymm11[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,22,28],zero,zero,zero ; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] @@ -2624,7 +2624,7 @@ define void @load_i8_stride6_vf32(<192 x i8>* %in.vec, <32 x i8>* %out.vec0, <32 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[5,11],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[1,7,13,u,u,u,u,u],zero,zero,zero,ymm12[3,9,15],zero,zero,ymm12[17,23,29,u,u,u,u,u],zero,zero,zero,ymm12[19,25,31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[3,9,15],zero,zero,ymm12[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[19,25,31] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm11[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[1,7,13],zero,zero,zero,ymm11[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[17,23,29],zero,zero,zero ; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index ca7180ff84cb4..4a07232cb6a21 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -110,7 +110,7 @@ define void @vf4(<4 x i16>* %in.vecptr0, <4 x i16>* %in.vecptr1, <4 x i16>* %in. ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, 16(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 3996024a6a1fb..89991e2171143 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -186,7 +186,7 @@ define void @store_i8_stride3_vf8(<8 x i8>* %in.vecptr0, <8 x i8>* %in.vecptr1, ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,ymm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, 16(%rcx) @@ -201,7 +201,7 @@ define void @store_i8_stride3_vf8(<8 x i8>* %in.vecptr0, <8 x i8>* %in.vecptr1, ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 44e11e979b92c..8f09a2fdc769d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -219,7 +219,7 @@ define void @store_i8_stride6_vf4(<4 x i8>* %in.vecptr0, <4 x i8>* %in.vecptr1, ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vmovq %xmm1, 16(%rax) @@ -240,7 +240,7 @@ define void @store_i8_stride6_vf4(<4 x i8>* %in.vecptr0, <4 x i8>* %in.vecptr1, ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vmovq %xmm1, 16(%rax) @@ -660,7 +660,7 @@ define void @store_i8_stride6_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,ymm4[21,29] @@ -671,7 +671,7 @@ define void @store_i8_stride6_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13,u,u],zero,zero,ymm4[6,14,u,u],zero,zero,ymm4[7,15,u,u],zero,zero,ymm4[16,24,u,u],zero,zero,ymm4[17,25,u,u],zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13],zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero,ymm6[18,26] ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm4 @@ -679,7 +679,7 @@ define void @store_i8_stride6_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u,18,26,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11,u,u],zero,zero,ymm0[4,12,u,u],zero,zero,ymm0[5,13,u,u],zero,zero,ymm0[22,30,u,u],zero,zero,ymm0[23,31,u,u],zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,10],zero,zero,zero,zero,ymm2[3,11],zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,ymm2[21,29],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,ymm2[23,31] ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -700,7 +700,7 @@ define void @store_i8_stride6_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,ymm4[21,29] @@ -711,7 +711,7 @@ define void @store_i8_stride6_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13,u,u],zero,zero,ymm4[6,14,u,u],zero,zero,ymm4[7,15,u,u],zero,zero,ymm4[16,24,u,u],zero,zero,ymm4[17,25,u,u],zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13],zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero,ymm6[18,26] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 @@ -719,7 +719,7 @@ define void @store_i8_stride6_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u,18,26,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11,u,u],zero,zero,ymm0[4,12,u,u],zero,zero,ymm0[5,13,u,u],zero,zero,ymm0[22,30,u,u],zero,zero,ymm0[23,31,u,u],zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,10],zero,zero,zero,zero,ymm2[3,11],zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,ymm2[21,29],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,ymm2[23,31] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 933e8c60dcd2c..ab2215c78ebde 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -3395,7 +3395,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -3409,7 +3409,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] ; AVX2-FAST-ALL-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u> ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 @@ -3422,7 +3422,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -3438,7 +3438,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] ; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] -; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] ; AVX512VLBW-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -3451,7 +3451,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX512VLBW-FAST-ALL: # %bb.0: ; AVX512VLBW-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u> ; AVX512VLBW-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero +; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX512VLBW-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] ; AVX512VLBW-FAST-ALL-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -3466,7 +3466,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX512VLBW-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] ; AVX512VLBW-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] -; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero +; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX512VLBW-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] ; AVX512VLBW-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -3501,7 +3501,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23] ; XOPAVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]