diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c5c12ffd9166e..541fe1f4b94ca 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22986,6 +22986,29 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); + unsigned NumPackedSignBits = std::min(VT.getScalarSizeInBits(), 16); + unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; + + // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to + // concat from subvectors to use VPTRUNC etc. + if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG)) { + // Truncate with PACKUS if we are truncating a vector with leading zero + // bits that extend all the way to the packed/truncated value. Pre-SSE41 + // we can only use PACKUSWB. + KnownBits Known = DAG.computeKnownBits(In); + if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) + if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, + Subtarget)) + return V; + + // Truncate with PACKSS if we are truncating a vector with sign-bits + // that extend all the way to the packed/truncated value. + if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) + if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, + Subtarget)) + return V; + } + // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) { @@ -23002,25 +23025,6 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return Op; } - unsigned NumPackedSignBits = std::min(VT.getScalarSizeInBits(), 16); - unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; - - // Truncate with PACKUS if we are truncating a vector with leading zero bits - // that extend all the way to the packed/truncated value. - // Pre-SSE41 we can only use PACKUSWB. - KnownBits Known = DAG.computeKnownBits(In); - if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) - if (SDValue V = - truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) - return V; - - // Truncate with PACKSS if we are truncating a vector with sign-bits that - // extend all the way to the packed/truncated value. - if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) - if (SDValue V = - truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) - return V; - // Handle truncation of V256 to V128 using shuffles. assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 906d7cb6854f4..87332531f750b 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -2969,11 +2969,8 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 ; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; KNL-NEXT: retq ; @@ -2989,11 +2986,8 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { ; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQNOBW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512DQNOBW-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQNOBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQNOBW-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX512DQNOBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <32 x i16> %x, %y diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll index 74697c5413f34..890075206fbda 100644 --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -466,12 +466,10 @@ define <4 x float> @PR45794(<2 x i64> %x, <2 x i64> %y) { ; ; AVX512VL-LABEL: PR45794: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsraq $48, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsraq $48, %xmm1, %xmm1 +; AVX512VL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a0 = ashr <2 x i64> %x, %s0 = sitofp <2 x i64> %a0 to <2 x float> diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 39ce19871594d..8fcb5f627f8ef 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -6186,11 +6186,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vpmovmskb %ymm1, %eax ; AVX512F-NEXT: notl %eax ; AVX512F-NEXT: testb $1, %al diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index 5a307925bb7fe..745c59baf710c 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -736,11 +736,8 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v32i16: @@ -749,11 +746,8 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v32i16: diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll index 01269b48efc09..832e50d147f6c 100644 --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -104,13 +104,7 @@ define <32 x i8> @trunc_concat_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) noun ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_concat_packsswb_256: @@ -153,13 +147,7 @@ define <32 x i8> @trunc_concat_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) noun ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_concat_packuswb_256: diff --git a/llvm/test/CodeGen/X86/vector-pack-512.ll b/llvm/test/CodeGen/X86/vector-pack-512.ll index 26be8013adbbd..1a535cca5c266 100644 --- a/llvm/test/CodeGen/X86/vector-pack-512.ll +++ b/llvm/test/CodeGen/X86/vector-pack-512.ll @@ -181,24 +181,20 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX512F-LABEL: concat_trunc_packsswb_512: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -224,24 +220,20 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX512F-LABEL: concat_trunc_packuswb_512: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 3eef750029791..23d6f256e1ab6 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -3406,11 +3406,8 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: sext_32xi1_to_32xi8: diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index b2515181ee7d6..eaa9d24af98fd 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -4516,11 +4516,8 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vpminuw (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: @@ -4528,11 +4525,8 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512VL-NEXT: vpminuw (%rdi), %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v32i16_v32i8: