diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 427c18a4bb576..9da121dd9ab87 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59427,6 +59427,31 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
      }
    }
    break;
+  case ISD::SETCC:
+    if (!IsSplat && EltSizeInBits == 1 &&
+        llvm::all_of(Ops, [Op0](SDValue Op) {
+          return Op0.getOperand(0).getValueType() ==
+                     Op.getOperand(0).getValueType() &&
+                 Op0.getOperand(2) == Op.getOperand(2);
+        })) {
+      EVT SrcVT = Op0.getOperand(0).getValueType();
+      EVT NewSrcVT = EVT::getVectorVT(Ctx, SrcVT.getScalarType(),
+                                      NumOps * SrcVT.getVectorNumElements());
+      unsigned SrcSizeInBits = SrcVT.getScalarSizeInBits();
+      if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(NewSrcVT) &&
+          (NewSrcVT.is256BitVector() ||
+           (NewSrcVT.is512BitVector() && Subtarget.useAVX512Regs() &&
+            (SrcSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
+        SDValue LHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 0);
+        SDValue RHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 1);
+        if (LHS || RHS)
+          return DAG.getNode(Opcode, DL, VT,
+                             LHS ? LHS : ConcatSubOperand(NewSrcVT, Ops, 0),
+                             RHS ? RHS : ConcatSubOperand(NewSrcVT, Ops, 1),
+                             Op0.getOperand(2));
+      }
+    }
+    break;
   case ISD::CTPOP:
   case ISD::CTTZ:
   case ISD::CTLZ:
@@ -59791,13 +59816,16 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
      }
    }
 
-    // Attempt to merge logic ops if the type is legal.
-    if (TLI.isTypeLegal(VT) && all_of(Ops, [](SDValue Op) {
-          return ISD::isBitwiseLogicOp(Op.getOpcode());
-        }))
+    // Attempt to merge comparison/logic ops if the type is legal.
+    if (TLI.isTypeLegal(VT) &&
+        (all_of(Ops, [](SDValue Op) { return Op.getOpcode() == ISD::SETCC; }) ||
+         all_of(Ops, [](SDValue Op) {
+           return ISD::isBitwiseLogicOp(Op.getOpcode());
+         }))) {
       if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops,
                                              DAG, Subtarget))
         return R;
+    }
 
     // Don't do anything else for i1 vectors.
     return SDValue();
diff --git a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index a24c1d8c2fcc4..7fb20418aeda4 100644
--- a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -52,13 +52,12 @@ define <8 x i1> @test3(<4 x i1> %a) {
 define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT:    vpmovd2m %xmm1, %k0
-; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT:    vpmovd2m %xmm0, %k1
-; CHECK-NEXT:    kshiftlb $4, %k0, %k0
-; CHECK-NEXT:    korb %k0, %k1, %k0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpslld $31, %ymm0, %ymm0
+; CHECK-NEXT:    vpmovd2m %ymm0, %k0
 ; CHECK-NEXT:    vpmovm2w %k0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 
   %res = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -68,13 +67,12 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
 define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllq $63, %xmm1, %xmm1
-; CHECK-NEXT:    vpmovq2m %xmm1, %k0
-; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT:    vpmovq2m %xmm0, %k1
-; CHECK-NEXT:    kshiftlb $2, %k0, %k0
-; CHECK-NEXT:    korw %k0, %k1, %k0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsllq $63, %ymm0, %ymm0
+; CHECK-NEXT:    vpmovq2m %ymm0, %k0
 ; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 
   %res = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/combine-icmp.ll b/llvm/test/CodeGen/X86/combine-icmp.ll
index 603917b52cd5f..dba583905c2c5 100644
--- a/llvm/test/CodeGen/X86/combine-icmp.ll
+++ b/llvm/test/CodeGen/X86/combine-icmp.ll
@@ -83,12 +83,12 @@ define i8 @concat_icmp_v8i32_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
 ;
 ; AVX512-LABEL: concat_icmp_v8i32_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    kshiftlb $4, %k1, %k1
-; AVX512-NEXT:    korb %k1, %k0, %k0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vptestnmd %ymm0, %ymm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %v0 = icmp eq <4 x i32> %a0, zeroinitializer
   %v1 = icmp eq <4 x i32> %a1, zeroinitializer
@@ -151,12 +151,12 @@ define i16 @concat_icmp_v16i16_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
 ;
 ; AVX512-LABEL: concat_icmp_v16i16_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; AVX512-NEXT:    vpcmpnleuw %xmm2, %xmm0, %k0
-; AVX512-NEXT:    vpcmpnleuw %xmm2, %xmm1, %k1
-; AVX512-NEXT:    kunpckbw %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %v0 = icmp ugt <8 x i16> %a0, splat (i16 1)
   %v1 = icmp ugt <8 x i16> %a1, splat (i16 1)
@@ -199,11 +199,11 @@ define i32 @concat_icmp_v32i8_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
 ;
 ; AVX512-LABEL: concat_icmp_v32i8_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
-; AVX512-NEXT:    vpcmpgtb %xmm2, %xmm0, %k0
-; AVX512-NEXT:    vpcmpgtb %xmm2, %xmm1, %k1
-; AVX512-NEXT:    kunpckwd %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %v0 = icmp sgt <16 x i8> %a0, splat (i8 5)
   %v1 = icmp sgt <16 x i8> %a1, splat (i8 5)
@@ -329,21 +329,15 @@ define i8 @concat_icmp_v8i64_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2,
 ;
 ; AVX512-LABEL: concat_icmp_v8i64_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [128,128]
-; AVX512-NEXT:    vpcmpltuq %xmm4, %xmm0, %k0
-; AVX512-NEXT:    vpcmpltuq %xmm4, %xmm1, %k1
-; AVX512-NEXT:    vpcmpltuq %xmm4, %xmm2, %k2
-; AVX512-NEXT:    vpcmpltuq %xmm4, %xmm3, %k3
-; AVX512-NEXT:    kshiftlb $2, %k3, %k3
-; AVX512-NEXT:    korb %k3, %k2, %k2
-; AVX512-NEXT:    kshiftlb $4, %k2, %k2
-; AVX512-NEXT:    kshiftlb $2, %k1, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftlb $4, %k0, %k0
-; AVX512-NEXT:    kshiftrb $4, %k0, %k0
-; AVX512-NEXT:    korb %k2, %k0, %k0
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %v0 = icmp ult <2 x i64> %a0, splat (i64 128)
   %v1 = icmp ult <2 x i64> %a1, splat (i64 128)
@@ -387,18 +381,16 @@ define i16 @concat_icmp_v16i32_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2
 ;
 ; AVX512-LABEL: concat_icmp_v16i32_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT:    vpcmpgtd %xmm4, %xmm0, %k0
-; AVX512-NEXT:    vpcmpgtd %xmm4, %xmm1, %k1
-; AVX512-NEXT:    vpcmpgtd %xmm4, %xmm2, %k2
-; AVX512-NEXT:    vpcmpgtd %xmm4, %xmm3, %k3
-; AVX512-NEXT:    kshiftlb $4, %k1, %k1
-; AVX512-NEXT:    korb %k1, %k0, %k0
-; AVX512-NEXT:    kshiftlb $4, %k3, %k1
-; AVX512-NEXT:    korb %k1, %k2, %k1
-; AVX512-NEXT:    kunpckbw %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %v0 = icmp sgt <4 x i32> %a0, zeroinitializer
   %v1 = icmp sgt <4 x i32> %a1, zeroinitializer
@@ -468,14 +460,14 @@ define i32 @concat_icmp_v32i16_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2
 ;
 ; AVX512-LABEL: concat_icmp_v32i16_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vptestmw %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vptestmw %xmm1, %xmm1, %k1
-; AVX512-NEXT:    vptestmw %xmm2, %xmm2, %k2
-; AVX512-NEXT:    vptestmw %xmm3, %xmm3, %k3
-; AVX512-NEXT:    kunpckbw %k0, %k1, %k0
-; AVX512-NEXT:    kunpckbw %k2, %k3, %k1
-; AVX512-NEXT:    kunpckwd %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vptestmw %zmm0, %zmm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %v0 = icmp ne <8 x i16> %a0, zeroinitializer
   %v1 = icmp ne <8 x i16> %a1, zeroinitializer
@@ -560,15 +552,14 @@ define i64 @concat_icmp_v64i8_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2,
 ;
 ; AVX512-LABEL: concat_icmp_v64i8_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512-NEXT:    vpcmpnleub %xmm4, %xmm0, %k0
-; AVX512-NEXT:    vpcmpnleub %xmm4, %xmm1, %k1
-; AVX512-NEXT:    vpcmpnleub %xmm4, %xmm2, %k2
-; AVX512-NEXT:    vpcmpnleub %xmm4, %xmm3, %k3
-; AVX512-NEXT:    kunpckwd %k0, %k1, %k0
-; AVX512-NEXT:    kunpckwd %k2, %k3, %k1
-; AVX512-NEXT:    kunpckdq %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
 ; AVX512-NEXT:    kmovq %k0, %rax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %v0 = icmp ugt <16 x i8> %a0, splat (i8 15)
   %v1 = icmp ugt <16 x i8> %a1, splat (i8 15)
@@ -672,10 +663,9 @@ define i8 @concat_icmp_v8i64_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ;
 ; AVX512-LABEL: concat_icmp_v8i64_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vptestnmq %ymm0, %ymm0, %k0
-; AVX512-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; AVX512-NEXT:    kshiftlb $4, %k1, %k1
-; AVX512-NEXT:    korb %k1, %k0, %k0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -768,10 +758,9 @@ define i16 @concat_icmp_v16i32_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
 ;
 ; AVX512-LABEL: concat_icmp_v16i32_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; AVX512-NEXT:    vpcmpnleud %ymm2, %ymm0, %k0
-; AVX512-NEXT:    vpcmpnleud %ymm2, %ymm1, %k1
-; AVX512-NEXT:    kunpckbw %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -830,10 +819,9 @@ define i32 @concat_icmp_v32i16_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
 ;
 ; AVX512-LABEL: concat_icmp_v32i16_v16i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
-; AVX512-NEXT:    vpcmpgtw %ymm2, %ymm0, %k0
-; AVX512-NEXT:    vpcmpgtw %ymm2, %ymm1, %k1
-; AVX512-NEXT:    kunpckwd %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -903,10 +891,9 @@ define i64 @concat_icmp_v64i8_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
 ;
 ; AVX512-LABEL: concat_icmp_v64i8_v32i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT:    vpcmpgtb %ymm0, %ymm2, %k0
-; AVX512-NEXT:    vpcmpgtb %ymm1, %ymm2, %k1
-; AVX512-NEXT:    kunpckdq %k0, %k1, %k0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpcmpltb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
 ; AVX512-NEXT:    kmovq %k0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index ad08eaffab383..7e00d679d56b2 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -43,25 +43,23 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
 ; AVX256-LABEL: testv16i1_sext_v16i8:
 ; AVX256:       # %bb.0:
 ; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX256-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX256-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
 ;
 ; AVX512VL-LABEL: testv16i1_sext_v16i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
@@ -70,10 +68,8 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
 ; AVX512F-LABEL: testv16i1_sext_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -91,13 +87,13 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
 ; AVX256-LABEL: testv16i1_sext_v16i16:
 ; AVX256:       # %bb.0:
 ; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT:    kshiftrw $8, %k1, %k1
+; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX256-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX256-NEXT:    retq
@@ -105,10 +101,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
 ; AVX512VL-LABEL: testv16i1_sext_v16i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -116,10 +110,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
 ; AVX512F-LABEL: testv16i1_sext_v16i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -173,27 +165,25 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) {
 ; AVX256-LABEL: testv16i1_zext_v16i8:
 ; AVX256:       # %bb.0:
 ; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
 ; AVX256-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; AVX256-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX256-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX256-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
 ;
 ; AVX512VL-LABEL: testv16i1_zext_v16i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
@@ -202,10 +192,8 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) {
 ; AVX512F-LABEL: testv16i1_zext_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -223,13 +211,13 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
 ; AVX256-LABEL: testv16i1_zext_v16i16:
 ; AVX256:       # %bb.0:
 ; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT:    kshiftrw $8, %k1, %k1
+; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX256-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX256-NEXT:    vpsrlw $15, %ymm0, %ymm0
@@ -238,10 +226,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
 ; AVX512VL-LABEL: testv16i1_zext_v16i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $15, %ymm0, %ymm0
@@ -250,10 +236,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
 ; AVX512F-LABEL: testv16i1_zext_v16i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 3699c7f75c861..93384341e03a4 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -18,26 +18,23 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
 ; AVX256VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm2 {%k1} {z}
-; AVX256VL-NEXT:    vpmovdw %ymm2, %xmm2
-; AVX256VL-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
-; AVX256VL-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
-; AVX256VL-NEXT:    vpmovsxwd %xmm3, %ymm3
-; AVX256VL-NEXT:    vpslld $31, %ymm3, %ymm3
-; AVX256VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
-; AVX256VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; AVX256VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1]
-; AVX256VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7]
-; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX256VL-NEXT:    vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k0
-; AVX256VL-NEXT:    kunpckbw %k1, %k0, %k0
-; AVX256VL-NEXT:    kshiftrw $8, %k0, %k2
-; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256VL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm3 {%k1} {z}
+; AVX256VL-NEXT:    vpmovdw %ymm3, %xmm3
+; AVX256VL-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1]
+; AVX256VL-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4],xmm2[5],xmm4[6,7]
+; AVX256VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7]
+; AVX256VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
+; AVX256VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX256VL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX256VL-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX256VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
 ; AVX256VL-NEXT:    vzeroupper
 ; AVX256VL-NEXT:    retq
 ;
@@ -135,14 +132,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
 ; AVX256VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX256VL-NEXT:    vpmovsxbd %xmm1, %ymm1
 ; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX256VL-NEXT:    vpmovsxbd %xmm1, %ymm1
-; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k2
-; AVX256VL-NEXT:    vpmovsxbd %xmm0, %ymm0
-; AVX256VL-NEXT:    vptestmd %ymm0, %ymm0, %k3
+; AVX256VL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX256VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
 ; AVX256VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k3} {z}
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT:    kshiftrw $8, %k2, %k2
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm2 {%k2} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm2, %xmm2
 ; AVX256VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
@@ -153,20 +148,15 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
 ; AVX256VL-NEXT:    vpmovdw %ymm2, %xmm2
 ; AVX256VL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
 ; AVX256VL-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm1
-; AVX256VL-NEXT:    vpmovsxwd %xmm2, %ymm1
-; AVX256VL-NEXT:    vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT:    vextracti128 $1, %ymm2, %xmm1
-; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX256VL-NEXT:    vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k0
-; AVX256VL-NEXT:    kunpckbw %k1, %k0, %k0
-; AVX256VL-NEXT:    kshiftrw $8, %k0, %k2
-; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256VL-NEXT:    vpmovsxwd %ymm2, %zmm1
+; AVX256VL-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX256VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
 ; AVX256VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX256VL-NEXT:    retq
 ;
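
For reference, the fold this patch enables can be reproduced with a small IR test in the style of combine-icmp.ll. The sketch below is illustrative and not part of the patch (the function name is hypothetical; it mirrors the concat_icmp_v8i32_v4i32 test above): both v4i1 compare results are consumed only as a concatenated v8i1 mask, so combineConcatVectorOps now concatenates the i32 sources first and emits a single 256-bit compare instead of two 128-bit compares stitched together with kshiftlb/korb.

; Illustrative sketch, not part of the patch; mirrors concat_icmp_v8i32_v4i32.
; With this change, AVX512 lowers the body to one vinserti128 concatenation
; followed by a single "vptestnmd %ymm0, %ymm0, %k0".
define i8 @concat_icmp_sketch(<4 x i32> %a0, <4 x i32> %a1) {
  %v0 = icmp eq <4 x i32> %a0, zeroinitializer
  %v1 = icmp eq <4 x i32> %a1, zeroinitializer
  %m = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = bitcast <8 x i1> %m to i8
  ret i8 %r
}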