diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fd47bb3024680..b7c468a9579c8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -273,6 +273,7 @@ namespace {
     SDValue visitANY_EXTEND(SDNode *N);
     SDValue visitSIGN_EXTEND_INREG(SDNode *N);
     SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
+    SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
    SDValue visitTRUNCATE(SDNode *N);
    SDValue visitBITCAST(SDNode *N);
    SDValue visitBUILD_PAIR(SDNode *N);
@@ -1396,6 +1397,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
   case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
   case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
+  case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
   case ISD::TRUNCATE:           return visitTRUNCATE(N);
   case ISD::BITCAST:            return visitBITCAST(N);
   case ISD::BUILD_PAIR:         return visitBUILD_PAIR(N);
@@ -5722,7 +5724,8 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
   EVT VT = N->getValueType(0);
 
   assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
-          Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
+          Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
+          Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
          && "Expected EXTEND dag node in input!");
 
   // fold (sext c1) -> c1
@@ -7000,6 +7003,20 @@ SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(VT);
+
+  if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
+                                              LegalOperations))
+    return SDValue(Res, 0);
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index e9f24a17408fa..f8c2f1b1fc69e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -863,7 +863,7 @@ SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
   int NumSrcElements = SrcVT.getVectorNumElements();
 
   // Build up a zero vector to blend into this one.
-  SDValue Zero = DAG.getTargetConstant(0, DL, SrcVT);
+  SDValue Zero = DAG.getConstant(0, DL, SrcVT);
 
   // Shuffle the incoming lanes into the correct position, and pull all other
   // lanes from the zero vector.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a686ee086333a..789070232158c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28481,13 +28481,15 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
   return R.getValue(1);
 }
 
-/// Convert a SEXT of a vector to a SIGN_EXTEND_VECTOR_INREG, this requires
-/// the splitting (or concatenating with UNDEFs) of the input to vectors of the
-/// same size as the target type which then extends the lowest elements.
+/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
+/// ZERO_EXTEND_VECTOR_INREG; this requires the splitting (or concatenating
+/// with UNDEFs) of the input to vectors of the same size as the target type,
+/// which then extends the lowest elements.
 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
-  if (N->getOpcode() != ISD::SIGN_EXTEND)
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
     return SDValue();
   if (!DCI.isBeforeLegalizeOps())
     return SDValue();
@@ -28508,6 +28510,12 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
 
+  // On AVX2+ targets, if the input/output types are both legal then we will be
+  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
+  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+    return SDValue();
+
   SDLoc DL(N);
 
   auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) {
@@ -28527,20 +28535,22 @@
     EVT ExVT = EVT::getVectorVT(*DAG.getContext(), SVT,
                                 128 / SVT.getSizeInBits());
     SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
-    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex);
+    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                        DAG.getIntPtrConstant(0, DL));
   }
 
   // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
-  // ISD::SIGN_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::VSEXT.
+  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
   if (VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256())) {
     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
-    return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
+    return Opcode == ISD::SIGN_EXTEND
+               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
+               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
   }
 
   // On pre-AVX2 targets, split into 128-bit nodes of
-  // ISD::SIGN_EXTEND_VECTOR_INREG.
+  // ISD::*_EXTEND_VECTOR_INREG.
   if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
     unsigned NumVecs = VT.getSizeInBits() / 128;
     unsigned NumSubElts = 128 / SVT.getSizeInBits();
@@ -28552,7 +28562,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
     SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                  DAG.getIntPtrConstant(Offset, DL));
     SrcVec = ExtendVecSize(DL, SrcVec, 128);
-    SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
+    SrcVec = Opcode == ISD::SIGN_EXTEND
+                 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
+                 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
     Opnds.push_back(SrcVec);
   }
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
@@ -28671,6 +28683,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+    return V;
+
   if (VT.is256BitVector())
     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
       return R;
diff --git a/llvm/test/CodeGen/X86/pr26870.ll b/llvm/test/CodeGen/X86/pr26870.ll
new file mode 100644
index 0000000000000..2731ed2d01258
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr26870.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=i686-pc-windows-msvc18.0.0 -mcpu=pentium4
+
+define x86_thiscallcc i32* @fn4(i32* %this, i8* dereferenceable(1) %p1) {
+entry:
+  %DL = getelementptr inbounds i32, i32* %this, i32 0
+  %call.i = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+  %getTypeAllocSize___trans_tmp_2.i = getelementptr inbounds i32, i32* %this, i32 0
+  %0 = load i32, i32* %getTypeAllocSize___trans_tmp_2.i, align 4
+  %call.i8 = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+  %1 = insertelement <2 x i64> undef, i64 %call.i, i32 0
+  %2 = insertelement <2 x i64> %1, i64 %call.i8, i32 1
+  %3 = add nsw <2 x i64> %2,
+  %4 = sdiv <2 x i64> %3,
+  %5 = add nsw <2 x i64> %4,
+  %6 = load i32, i32* %getTypeAllocSize___trans_tmp_2.i, align 4
+  %7 = insertelement <2 x i32> undef, i32 %0, i32 0
+  %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+  %9 = zext <2 x i32> %8 to <2 x i64>
+  %10 = srem <2 x i64> %5, %9
+  %11 = sub <2 x i64> %5, %10
+  %12 = trunc <2 x i64> %11 to <2 x i32>
+  %13 = extractelement <2 x i32> %12, i32 0
+  %14 = extractelement <2 x i32> %12, i32 1
+  %cmp = icmp eq i32 %13, %14
+  br i1 %cmp, label %if.then, label %cleanup
+
+if.then:
+  %call4 = tail call x86_thiscallcc i32* @fn3(i8* nonnull %p1)
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32* [ %call4, %if.then ], [ undef, %entry ]
+  ret i32* %retval.0
+}
+
+declare x86_thiscallcc i32* @fn3(i8*)
+declare x86_thiscallcc i64 @fn1(i32*)
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 42924069d9e62..bc56ae7ae0731 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1428,11 +1428,10 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
 ;
 ; AVX1-LABEL: uitofp_16i8_to_4f32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1752,18 +1751,16 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
 ;
 ; AVX1-LABEL: uitofp_8i8_to_8f32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uitofp_8i8_to_8f32:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1786,11 +1783,10 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
 ;
 ; AVX1-LABEL: uitofp_16i8_to_8f32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index a3c567de652a5..ec1a4ecdc55c4 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -143,23 +143,20 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_16i8_to_8i32:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_16i8_to_8i32:
 ; AVX2:       # BB#0: # %entry
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: zext_16i8_to_8i32:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -225,23 +222,20 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_16i8_to_4i64:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_16i8_to_4i64:
 ; AVX2:       # BB#0: # %entry
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: zext_16i8_to_4i64:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -385,25 +379,20 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_8i16_to_4i64:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i16_to_4i64:
 ; AVX2:       # BB#0: # %entry
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: zext_8i16_to_4i64:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; AVX512-NEXT:    retq
 entry:
   %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>