diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8cc6a28fadaccb..539d2f25a0cf25 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6322,7 +6322,8 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
 // Helper function to collect subvector ops that are concatenated together,
 // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
 // The subvectors in Ops are guaranteed to be the same type.
-static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
+                             SelectionDAG &DAG) {
   assert(Ops.empty() && "Expected an empty ops vector");
 
   if (N->getOpcode() == ISD::CONCAT_VECTORS) {
@@ -6338,21 +6339,34 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
     EVT SubVT = Sub.getValueType();
 
     // TODO - Handle more general insert_subvector chains.
-    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
-        Idx == (VT.getVectorNumElements() / 2)) {
-      // insert_subvector(insert_subvector(undef, x, lo), y, hi)
-      if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
-          Src.getOperand(1).getValueType() == SubVT &&
-          isNullConstant(Src.getOperand(2))) {
-        Ops.push_back(Src.getOperand(1));
+    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
+      // insert_subvector(undef, x, lo)
+      if (Idx == 0 && Src.isUndef()) {
         Ops.push_back(Sub);
+        Ops.push_back(DAG.getUNDEF(SubVT));
         return true;
       }
-      // insert_subvector(x, extract_subvector(x, lo), hi)
-      if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-          Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
-        Ops.append(2, Sub);
-        return true;
+      if (Idx == (VT.getVectorNumElements() / 2)) {
+        // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+        if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+            Src.getOperand(1).getValueType() == SubVT &&
+            isNullConstant(Src.getOperand(2))) {
+          Ops.push_back(Src.getOperand(1));
+          Ops.push_back(Sub);
+          return true;
+        }
+        // insert_subvector(x, extract_subvector(x, lo), hi)
+        if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+            Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+          Ops.append(2, Sub);
+          return true;
+        }
+        // insert_subvector(undef, x, hi)
+        if (Src.isUndef()) {
+          Ops.push_back(DAG.getUNDEF(SubVT));
+          Ops.push_back(Sub);
+          return true;
+        }
       }
     }
   }
@@ -6811,7 +6825,7 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
     }
   }
   SmallVector<SDValue> CatOps;
-  if (collectConcatOps(V.getNode(), CatOps)) {
+  if (collectConcatOps(V.getNode(), CatOps, DAG)) {
     for (SDValue &CatOp : CatOps) {
       SDValue NotCat = IsNOT(CatOp, DAG);
       if (!NotCat) return SDValue();
@@ -25278,7 +25292,8 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
        !Subtarget.hasBWI())) {
     SmallVector<SDValue> CatOps;
-    if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+    if (StoredVal.hasOneUse() &&
+        collectConcatOps(StoredVal.getNode(), CatOps, DAG))
       return splitVectorStore(St, DAG);
     return SDValue();
   }
@@ -39744,7 +39759,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
         return SDValue();
       SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
       SmallVector<SDValue> SubOps;
-      if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
+      if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
         return SubOps[Idx & 1];
       unsigned NumElts = Src.getValueType().getVectorNumElements();
       if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
@@ -43724,8 +43739,8 @@ static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
   SDValue FVal = N->getOperand(2);
   SmallVector<SDValue> CatOpsT, CatOpsF;
   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
-      !collectConcatOps(TVal.getNode(), CatOpsT) ||
-      !collectConcatOps(FVal.getNode(), CatOpsF))
+      !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
+      !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
     return SDValue();
 
   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
@@ -45048,7 +45063,7 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
   // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
   if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
     SmallVector<SDValue> Ops;
-    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
+    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
         Ops.size() == 2) {
       SDLoc DL(EFLAGS);
       EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
@@ -49683,7 +49698,8 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
     // PACK should still be worth it for 128-bit vectors if the sources were
     // originally concatenated from subvectors.
    SmallVector<SDValue> ConcatOps;
-    if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
+    if (VT.getSizeInBits() > 128 ||
+        !collectConcatOps(In.getNode(), ConcatOps, DAG))
      return SDValue();
   }
 
@@ -53607,7 +53623,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
 
   // Match concat_vector style patterns.
   SmallVector<SDValue> SubVectorOps;
-  if (collectConcatOps(N, SubVectorOps)) {
+  if (collectConcatOps(N, SubVectorOps, DAG)) {
     if (SDValue Fold =
             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
       return Fold;
@@ -53669,7 +53685,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
   SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
   SmallVector<SDValue> CatOps;
   if (Sel.getOpcode() != ISD::VSELECT ||
-      !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+      !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
     return SDValue();
 
   // Note: We assume simple value types because this should only be called with
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 34f6d9ffb67992..be99de22eeb308 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2430,14 +2430,11 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vmovups %ymm0, (%rsi)
+; AVX1-NEXT:    vmovdqa %xmm0, 16(%rsi)
+; AVX1-NEXT:    vmovdqa %xmm7, 112(%rsi)
+; AVX1-NEXT:    vmovdqa %xmm6, 48(%rsi)
 ; AVX1-NEXT:    vmovups %ymm1, 128(%rsi)
-; AVX1-NEXT:    vmovups %ymm7, 96(%rsi)
-; AVX1-NEXT:    vmovups %ymm6, 32(%rsi)
 ; AVX1-NEXT:    vmovupd %ymm5, 192(%rsi)
 ; AVX1-NEXT:    vmovups %ymm4, 224(%rsi)
 ; AVX1-NEXT:    vmovups %ymm3, 160(%rsi)
@@ -2461,11 +2458,9 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm4
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7]
@@ -2475,9 +2470,9 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX2-NEXT:    vmovdqu %ymm6, 192(%rsi)
 ; AVX2-NEXT:    vmovdqu %ymm5, 224(%rsi)
 ; AVX2-NEXT:    vmovdqu %ymm4, (%rsi)
-; AVX2-NEXT:    vmovdqu %ymm3, 64(%rsi)
-; AVX2-NEXT:    vmovdqu %ymm2, 32(%rsi)
-; AVX2-NEXT:    vmovdqu %ymm1, 96(%rsi)
+; AVX2-NEXT:    vmovdqa %xmm3, 48(%rsi)
+; AVX2-NEXT:    vmovdqa %xmm2, 112(%rsi)
+; AVX2-NEXT:    vmovdqu %ymm1, 64(%rsi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -2507,14 +2502,11 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; XOP-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
 ; XOP-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
 ; XOP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; XOP-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; XOP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; XOP-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; XOP-NEXT:    vmovups %ymm0, (%rsi)
+; XOP-NEXT:    vmovdqa %xmm0, 16(%rsi)
+; XOP-NEXT:    vmovdqa %xmm7, 112(%rsi)
+; XOP-NEXT:    vmovdqa %xmm6, 48(%rsi)
 ; XOP-NEXT:    vmovups %ymm1, 128(%rsi)
-; XOP-NEXT:    vmovups %ymm7, 96(%rsi)
-; XOP-NEXT:    vmovups %ymm6, 32(%rsi)
 ; XOP-NEXT:    vmovupd %ymm5, 192(%rsi)
 ; XOP-NEXT:    vmovups %ymm4, 224(%rsi)
 ; XOP-NEXT:    vmovups %ymm3, 160(%rsi)
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index fb4c23e7de1a41..cda2ab59655090 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -91,11 +91,11 @@ define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %stru
 ;
 ; AVX512-LABEL: PR40815:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX512-NEXT:    vmovaps 48(%rdi), %xmm1
-; AVX512-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
-; AVX512-NEXT:    vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
-; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovaps 48(%rdi), %xmm0
+; AVX512-NEXT:    vmovups 16(%rdi), %ymm1
+; AVX512-NEXT:    vinsertf128 $1, (%rdi), %ymm1, %ymm1
+; AVX512-NEXT:    vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
+; AVX512-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovups %zmm0, (%rsi)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll
index 666f72ba06cb79..da3e8a6ad12320 100644
--- a/llvm/test/CodeGen/X86/pr11334.ll
+++ b/llvm/test/CodeGen/X86/pr11334.ll
@@ -94,9 +94,7 @@ define void @test_vector_creation() nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps %ymm0, (%rax)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, (%rax)
 ; AVX-NEXT:    retq
   %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2
   %2 = load double, double addrspace(1)* null
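Editor's note (not part of the patch): the core change above is that collectConcatOps now treats a lone insert_subvector into an undef base as a two-element concatenation, reporting the missing half as an explicit UNDEF subvector. Materializing that UNDEF half is exactly what requires the new SelectionDAG parameter (DAG.getUNDEF(SubVT)), which is why every call site now passes DAG through. The standalone C++ sketch below models just this matching logic on a toy node type; Node, Kind, and collectConcat are hypothetical names for illustration, not the LLVM SelectionDAG API, and the extract_subvector self-concat case is omitted for brevity.

```cpp
#include <cstdio>

// Toy DAG node: just enough structure to express insert_subvector chains.
struct Node {
  enum Kind { Undef, Leaf, InsertSubvector } kind;
  const Node *src = nullptr; // operand 0: vector being inserted into
  const Node *sub = nullptr; // operand 1: subvector being inserted
  unsigned idx = 0;          // operand 2: insertion element index
  unsigned numElts;          // vector width in elements
  Node(Kind k, unsigned n) : kind(k), numElts(n) {}
};

// Try to view N as concat(lo, hi), where each half is numElts/2 wide.
// Mirrors the patched collectConcatOps: a lone insert into an undef base
// is now accepted, with the missing half reported as an explicit undef
// (the real helper creates that node via DAG.getUNDEF(SubVT)).
static bool collectConcat(const Node *N, const Node *UndefHalf,
                          const Node **Lo, const Node **Hi) {
  if (N->kind != Node::InsertSubvector || N->sub->numElts * 2 != N->numElts)
    return false;
  // insert_subvector(undef, x, lo) -> concat(x, undef)  [new in this patch]
  if (N->idx == 0 && N->src->kind == Node::Undef) {
    *Lo = N->sub;
    *Hi = UndefHalf;
    return true;
  }
  if (N->idx == N->numElts / 2) {
    // insert_subvector(insert_subvector(undef, x, lo), y, hi) -> concat(x, y)
    if (N->src->kind == Node::InsertSubvector && N->src->idx == 0 &&
        N->src->sub->numElts == N->sub->numElts) {
      *Lo = N->src->sub;
      *Hi = N->sub;
      return true;
    }
    // insert_subvector(undef, x, hi) -> concat(undef, x)  [new in this patch]
    if (N->src->kind == Node::Undef) {
      *Lo = UndefHalf;
      *Hi = N->sub;
      return true;
    }
  }
  return false;
}

int main() {
  Node undef8(Node::Undef, 8), undef4(Node::Undef, 4), x(Node::Leaf, 4);
  Node ins(Node::InsertSubvector, 8); // insert_subvector(undef8, x, 4)
  ins.src = &undef8;
  ins.sub = &x;
  ins.idx = 4; // x lands in the high half; the low half stays undef
  const Node *lo = nullptr, *hi = nullptr;
  bool matched = collectConcat(&ins, &undef4, &lo, &hi);
  std::printf("matched=%d lo-is-undef=%d hi-is-x=%d\n", matched,
              lo == &undef4, hi == &x);
  return 0;
}
```

The test diffs show the payoff: once a value widened as insert_subvector(undef, x, hi) is visible as concat(undef, x), callers such as LowerStore and combineTargetShuffle can work on the halves directly, so the vinsertf128/vinserti128 round-trips through ymm registers disappear in favor of plain xmm stores of the defined half.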