diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4e9f0a86924d3..d0ee70deb2d52 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14817,6 +14817,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                              Zeroable, Subtarget, DAG))
     return V;
 
+  // Check for compaction patterns.
+  bool IsSingleInput = V2.isUndef();
+  int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+
   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   // with PSHUFB. It is important to do this before we attempt to generate any
   // blends but after all of the single-input lowerings. If the single input
@@ -14827,10 +14831,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // and there are *very* few patterns that would actually be faster than the
   // PSHUFB approach because of its ability to zero lanes.
   //
+  // If the mask is a binary compaction, we can more efficiently perform this
+  // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
+  // TODO: AVX2+ targets see a regression because they fail to see through
+  // VBROADCAST_LOAD masks.
+  //
   // FIXME: The only exceptions to the above are blends which are exact
   // interleavings with direct instructions supporting them. We currently don't
   // handle those well here.
-  if (Subtarget.hasSSSE3()) {
+  if (Subtarget.hasSSSE3() &&
+      (Subtarget.hasInt256() || IsSingleInput || NumEvenDrops != 1)) {
     bool V1InUse = false;
     bool V2InUse = false;
 
@@ -14888,8 +14898,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // We special case these as they can be particularly efficiently handled with
   // the PACKUSWB instruction on x86 and they show up in common patterns of
   // rearranging bytes to truncate wide elements.
-  bool IsSingleInput = V2.isUndef();
-  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
+  if (NumEvenDrops) {
     // NumEvenDrops is the power of two stride of the elements. Another way of
     // thinking about it is that we need to drop the even elements this many
     // times to get the original input.
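Reviewer note (not part of the patch): for reference, the PACKUS(AND(),AND()) compaction described in the comment above can be written with plain SSE2 intrinsics as in the sketch below; the helper name trunc2x8i16_to_16i8 is hypothetical. Masking each 16-bit lane down to its low byte first keeps every value in the 0..255 range, so the unsigned saturation in PACKUSWB never fires and the pack is an exact truncation. This is the same trick the pre-SSSE3 path already used; the patch now prefers it on SSSE3+ targets over two PSHUFBs plus a PUNPCKLQDQ.

#include <emmintrin.h> // SSE2: _mm_set1_epi16, _mm_and_si128, _mm_packus_epi16

// Hypothetical helper: truncate two <8 x i16> vectors to one <16 x i8>.
static __m128i trunc2x8i16_to_16i8(__m128i lo, __m128i hi) {
  const __m128i ByteMask = _mm_set1_epi16(0x00FF); // bytes [255,0,255,0,...]
  lo = _mm_and_si128(lo, ByteMask); // PAND: clear the high byte of each word
  hi = _mm_and_si128(hi, ByteMask); // PAND
  // PACKUSWB: all values are already 0..255, so the unsigned saturation is a
  // no-op and each word is truncated exactly to its low byte.
  return _mm_packus_epi16(lo, hi);
}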
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 2feb0382d8c9e..3fecf6b8d4c3a 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -4652,10 +4652,10 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
 ; SSE4-LABEL: truncstore_v32i16_v32i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pxor %xmm7, %xmm7
-; SSE4-NEXT:    movdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE4-NEXT:    pshufb %xmm6, %xmm1
-; SSE4-NEXT:    pshufb %xmm6, %xmm0
-; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE4-NEXT:    pand %xmm6, %xmm1
+; SSE4-NEXT:    pand %xmm6, %xmm0
+; SSE4-NEXT:    packuswb %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpeqb %xmm7, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %ecx
 ; SSE4-NEXT:    xorl $65535, %ecx # imm = 0xFFFF
@@ -4711,14 +4711,14 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
 ; SSE4-NEXT:  .LBB15_29: # %cond.store27
 ; SSE4-NEXT:    pextrb $14, %xmm0, 14(%rdi)
 ; SSE4-NEXT:  .LBB15_30: # %else28
-; SSE4-NEXT:    pshufb %xmm6, %xmm3
-; SSE4-NEXT:    pshufb %xmm6, %xmm2
+; SSE4-NEXT:    pand %xmm6, %xmm3
+; SSE4-NEXT:    pand %xmm6, %xmm2
 ; SSE4-NEXT:    testl $32768, %eax # imm = 0x8000
 ; SSE4-NEXT:    je .LBB15_32
 ; SSE4-NEXT:  # %bb.31: # %cond.store29
 ; SSE4-NEXT:    pextrb $15, %xmm0, 15(%rdi)
 ; SSE4-NEXT:  .LBB15_32: # %else30
-; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE4-NEXT:    packuswb %xmm3, %xmm2
 ; SSE4-NEXT:    testl $65536, %eax # imm = 0x10000
 ; SSE4-NEXT:    jne .LBB15_33
 ; SSE4-NEXT:  # %bb.34: # %else32
@@ -5750,10 +5750,10 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma
 ; SSE4-LABEL: truncstore_v16i16_v16i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pxor %xmm3, %xmm3
-; SSE4-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE4-NEXT:    pshufb %xmm4, %xmm1
-; SSE4-NEXT:    pshufb %xmm4, %xmm0
-; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE4-NEXT:    pand %xmm4, %xmm1
+; SSE4-NEXT:    pand %xmm4, %xmm0
+; SSE4-NEXT:    packuswb %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpeqb %xmm2, %xmm3
 ; SSE4-NEXT:    pmovmskb %xmm3, %eax
 ; SSE4-NEXT:    xorl $65535, %eax # imm = 0xFFFF
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index e0545c8a584d4..7317f424a4d75 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -13,16 +13,25 @@
 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
 
 define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512F:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 3457450a3ee02..a4af6c1eb1e0b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -356,29 +356,17 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 }
 
 define i1 @trunc_v16i16_v16i1(<16 x i16>) {
-; SSE2-LABEL: trunc_v16i16_v16i1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    psllw $7, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    cmpw $-1, %ax
-; SSE2-NEXT:    sete %al
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: trunc_v16i16_v16i1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm2, %xmm1
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %eax
-; SSE41-NEXT:    cmpw $-1, %ax
-; SSE41-NEXT:    sete %al
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc_v16i16_v16i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    cmpw $-1, %ax
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_v16i16_v16i1:
 ; AVX1:       # %bb.0:
@@ -695,37 +683,21 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 }
 
 define i1 @trunc_v32i16_v32i1(<32 x i16>) {
-; SSE2-LABEL: trunc_v32i16_v32i1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    psllw $7, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    cmpw $-1, %ax
-; SSE2-NEXT:    sete %al
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: trunc_v32i16_v32i1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm4, %xmm3
-; SSE41-NEXT:    pshufb %xmm4, %xmm2
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT:    pshufb %xmm4, %xmm1
-; SSE41-NEXT:    pshufb %xmm4, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %eax
-; SSE41-NEXT:    cmpw $-1, %ax
-; SSE41-NEXT:    sete %al
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc_v32i16_v32i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    cmpw $-1, %ax
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_v32i16_v32i1:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 5a0deab79ae98..025dbcb3ff632 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -350,29 +350,17 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 }
 
 define i1 @trunc_v16i16_v16i1(<16 x i16>) {
-; SSE2-LABEL: trunc_v16i16_v16i1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    psllw $7, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    testw %ax, %ax
-; SSE2-NEXT:    setne %al
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: trunc_v16i16_v16i1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm2, %xmm1
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %eax
-; SSE41-NEXT:    testw %ax, %ax
-; SSE41-NEXT:    setne %al
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc_v16i16_v16i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    testw %ax, %ax
+; SSE-NEXT:    setne %al
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_v16i16_v16i1:
 ; AVX1:       # %bb.0:
@@ -689,37 +677,21 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 }
 
 define i1 @trunc_v32i16_v32i1(<32 x i16>) {
-; SSE2-LABEL: trunc_v32i16_v32i1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    psllw $7, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    testw %ax, %ax
-; SSE2-NEXT:    setne %al
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: trunc_v32i16_v32i1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm4, %xmm3
-; SSE41-NEXT:    pshufb %xmm4, %xmm2
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT:    pshufb %xmm4, %xmm1
-; SSE41-NEXT:    pshufb %xmm4, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    por %xmm2, %xmm0
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %eax
-; SSE41-NEXT:    testw %ax, %ax
-; SSE41-NEXT:    setne %al
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc_v32i16_v32i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    testw %ax, %ax
+; SSE-NEXT:    setne %al
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_v32i16_v32i1:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 8014f1f415162..43b6ef57d46f0 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -365,33 +365,19 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 }
 
 define i1 @trunc_v16i16_v16i1(<16 x i16>) {
-; SSE2-LABEL: trunc_v16i16_v16i1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    psllw $7, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    movl %eax, %ecx
-; SSE2-NEXT:    shrl $8, %ecx
-; SSE2-NEXT:    xorb %al, %cl
-; SSE2-NEXT:    setnp %al
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: trunc_v16i16_v16i1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm2, %xmm1
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %eax
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    shrl $8, %ecx
-; SSE41-NEXT:    xorb %al, %cl
-; SSE41-NEXT:    setnp %al
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc_v16i16_v16i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    xorb %al, %cl
+; SSE-NEXT:    setnp %al
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_v16i16_v16i1:
 ; AVX1:       # %bb.0:
@@ -775,41 +761,23 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 }
 
 define i1 @trunc_v32i16_v32i1(<32 x i16>) {
-; SSE2-LABEL: trunc_v32i16_v32i1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    psllw $7, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    movl %eax, %ecx
-; SSE2-NEXT:    shrl $8, %ecx
-; SSE2-NEXT:    xorb %al, %cl
-; SSE2-NEXT:    setnp %al
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: trunc_v32i16_v32i1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm4, %xmm3
-; SSE41-NEXT:    pshufb %xmm4, %xmm2
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT:    pshufb %xmm4, %xmm1
-; SSE41-NEXT:    pshufb %xmm4, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %eax
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    shrl $8, %ecx
-; SSE41-NEXT:    xorb %al, %cl
-; SSE41-NEXT:    setnp %al
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc_v32i16_v32i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    xorb %al, %cl
+; SSE-NEXT:    setnp %al
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_v32i16_v32i1:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index f3b35423bd7a5..5608917cc84c5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1969,36 +1969,20 @@ define <16 x i8> @shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09(
 }
 
 define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
-; SSE2-LABEL: PR12412:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: PR12412:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    pshufb %xmm2, %xmm1
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: PR12412:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm2, %xmm1
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    retq
+; SSE-LABEL: PR12412:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: PR12412:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR12412:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 3ad9ff10f2bad..475576687741a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4753,14 +4753,14 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_
 ; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -4808,14 +4808,14 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_
 ; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 87235ed9c69db..0e42991c3b2e1 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1026,32 +1026,14 @@ entry:
 
 ;PR25684
 define void @trunc16i16_16i8(<16 x i16> %a) {
-; SSE2-LABEL: trunc16i16_16i8:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    movdqu %xmm0, (%rax)
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: trunc16i16_16i8:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    pshufb %xmm2, %xmm1
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    movdqu %xmm0, (%rax)
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: trunc16i16_16i8:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm2, %xmm1
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    movdqu %xmm0, (%rax)
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc16i16_16i8:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    movdqu %xmm0, (%rax)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc16i16_16i8:
 ; AVX1:       # %bb.0: # %entry
@@ -1235,44 +1217,18 @@ entry:
 }
 
 define void @trunc32i16_32i8(<32 x i16> %a) {
-; SSE2-LABEL: trunc32i16_32i8:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    movdqu %xmm2, (%rax)
-; SSE2-NEXT:    movdqu %xmm0, (%rax)
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: trunc32i16_32i8:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    pshufb %xmm4, %xmm1
-; SSSE3-NEXT:    pshufb %xmm4, %xmm0
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    pshufb %xmm4, %xmm3
-; SSSE3-NEXT:    pshufb %xmm4, %xmm2
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSSE3-NEXT:    movdqu %xmm2, (%rax)
-; SSSE3-NEXT:    movdqu %xmm0, (%rax)
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: trunc32i16_32i8:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm4, %xmm1
-; SSE41-NEXT:    pshufb %xmm4, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    pshufb %xmm4, %xmm3
-; SSE41-NEXT:    pshufb %xmm4, %xmm2
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT:    movdqu %xmm2, (%rax)
-; SSE41-NEXT:    movdqu %xmm0, (%rax)
-; SSE41-NEXT:    retq
+; SSE-LABEL: trunc32i16_32i8:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    movdqu %xmm2, (%rax)
+; SSE-NEXT:    movdqu %xmm0, (%rax)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc32i16_32i8:
 ; AVX1:       # %bb.0: # %entry
@@ -1726,37 +1682,29 @@ entry:
 }
 
 define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: trunc2x8i16_16i8:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: trunc2x8i16_16i8:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    pshufb %xmm2, %xmm1
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    retq
+; SSE-LABEL: trunc2x8i16_16i8:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
-; SSE41-LABEL: trunc2x8i16_16i8:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm2, %xmm1
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    retq
+; AVX1-LABEL: trunc2x8i16_16i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: trunc2x8i16_16i8:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    retq
+; AVX2-LABEL: trunc2x8i16_16i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc2x8i16_16i8:
 ; AVX512F:       # %bb.0: # %entry