diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e6f8b98e9c8e42..ee2918e419404d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -612,6 +612,7 @@ namespace {
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
     SDValue convertBuildVecZextToZext(SDNode *N);
+    SDValue convertBuildVecZextToBuildVecWithZeros(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
     SDValue reduceBuildVecTruncToBitCast(SDNode *N);
     SDValue reduceBuildVecToShuffle(SDNode *N);
@@ -21451,6 +21452,117 @@ SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
                      VT, In);
 }
 
+// If this is a very simple BUILD_VECTOR with first element being a ZERO_EXTEND,
+// and all other elements being constant zeros, granularize the BUILD_VECTOR's
+// element width, absorbing the ZERO_EXTEND, turning it into a constant zero op.
+// This pattern can appear during legalization.
+//
+// NOTE: This can be generalized to allow more than a single
+// non-constant-zero op, UNDEFs, and to be KnownBits-based.
+SDValue DAGCombiner::convertBuildVecZextToBuildVecWithZeros(SDNode *N) {
+  // Don't run this after legalization. Targets may have other preferences.
+  if (Level >= AfterLegalizeDAG)
+    return SDValue();
+
+  // FIXME: support big-endian.
+  if (DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  EVT OpVT = N->getOperand(0).getValueType();
+  assert(!VT.isScalableVector() && "Encountered scalable BUILD_VECTOR?");
+
+  unsigned EltBitwidth = VT.getScalarSizeInBits();
+  // NOTE: the actual width of operands may be wider than that!
+
+  // Analyze all operands of this BUILD_VECTOR. What is the largest number of
+  // active bits they all have? We'll want to truncate them all to that width.
+  unsigned ActiveBits = 0;
+  APInt KnownZeroOps(VT.getVectorNumElements(), 0);
+  for (auto I : enumerate(N->ops())) {
+    SDValue Op = I.value();
+    // FIXME: support UNDEF elements?
+    if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
+      unsigned OpActiveBits =
+          Cst->getAPIntValue().trunc(EltBitwidth).getActiveBits();
+      if (OpActiveBits == 0) {
+        KnownZeroOps.setBit(I.index());
+        continue;
+      }
+      // Profitability check: don't allow non-zero constant operands.
+      return SDValue();
+    }
+    // Profitability check: there must only be a single non-zero operand,
+    // and it must be the first operand of the BUILD_VECTOR.
+    if (I.index() != 0)
+      return SDValue();
+    // The operand must be a zero-extension itself.
+    // FIXME: this could be generalized to known leading zeros check.
+    if (Op.getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    unsigned CurrActiveBits =
+        Op.getOperand(0).getValueSizeInBits().getFixedSize();
+    assert(!ActiveBits && "Already encountered non-constant-zero operand?");
+    ActiveBits = CurrActiveBits;
+    // We want to at least halve the element size.
+    if (2 * ActiveBits > EltBitwidth)
+      return SDValue();
+  }
+
+  // This BUILD_VECTOR must have at least one non-constant-zero operand.
+  if (ActiveBits == 0)
+    return SDValue();
+
+  // We have EltBitwidth bits, the *minimal* chunk size is ActiveBits,
+  // into how many chunks can we split our element width?
+  unsigned Factor = divideCeil(EltBitwidth, ActiveBits);
+  assert(Factor > 1 && "Did not split the element after all?");
+  assert(EltBitwidth % Factor == 0 && "Can not split into this many chunks?");
+  unsigned ChunkBitwidth = EltBitwidth / Factor;
+  assert(ChunkBitwidth >= ActiveBits && "Underestimated chunk size?");
+  assert(ChunkBitwidth < EltBitwidth && "Failed to reduce element width?");
+
+  EVT OpIntVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
+  EVT NewScalarIntVT = EVT::getIntegerVT(*DAG.getContext(), ChunkBitwidth);
+  EVT NewIntVT = EVT::getVectorVT(*DAG.getContext(), NewScalarIntVT,
+                                  Factor * N->getNumOperands());
+
+  // Never create illegal types.
+  if (!TLI.isTypeLegal(OpIntVT) || !TLI.isTypeLegal(NewScalarIntVT) ||
+      !TLI.isTypeLegal(NewIntVT))
+    return SDValue();
+
+  if (LegalOperations &&
+      !(TLI.isOperationLegalOrCustom(ISD::BITCAST, OpIntVT) &&
+        TLI.isOperationLegalOrCustom(ISD::TRUNCATE, NewScalarIntVT) &&
+        TLI.isOperationLegalOrCustom(ISD::BUILD_VECTOR, NewIntVT)))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue ZeroOp = DAG.getConstant(0, DL, NewScalarIntVT);
+
+  // Recreate the BUILD_VECTOR, with elements now being Factor times smaller.
+  SmallVector<SDValue, 16> NewOps;
+  NewOps.reserve(NewIntVT.getVectorNumElements());
+  for (auto I : enumerate(N->ops())) {
+    SDValue Op = I.value();
+    // FIXME: after allowing UNDEFs, do handle them here.
+    unsigned SrcOpIdx = I.index();
+    if (KnownZeroOps[SrcOpIdx]) {
+      NewOps.append(Factor, ZeroOp);
+      continue;
+    }
+    Op = DAG.getBitcast(OpIntVT, Op);
+    Op = DAG.getNode(ISD::TRUNCATE, DL, NewScalarIntVT, Op);
+    NewOps.emplace_back(Op);
+    NewOps.append(Factor - 1, ZeroOp);
+  }
+  assert(NewOps.size() == NewIntVT.getVectorNumElements());
+  SDValue NewBV = DAG.getBuildVector(NewIntVT, DL, NewOps);
+  NewBV = DAG.getBitcast(VT, NewBV);
+  return NewBV;
+}
+
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   EVT VT = N->getValueType(0);
 
@@ -21516,6 +21628,9 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   if (SDValue V = convertBuildVecZextToZext(N))
     return V;
 
+  if (SDValue V = convertBuildVecZextToBuildVecWithZeros(N))
+    return V;
+
   if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
     return V;
 
diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
index 7b60a398fa7b60..53e8b568f70963 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
@@ -16,8 +16,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; CHECK-LABEL: extract0_i32_zext_insert0_i64_zero:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[0]
 ; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %e = extractelement <4 x i32> %x, i32 0
@@ -42,8 +41,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; CHECK-LABEL: extract1_i32_zext_insert0_i64_zero:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[1]
 ; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %e = extractelement <4 x i32> %x, i32 1
@@ -68,8 +66,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; CHECK-LABEL: extract2_i32_zext_insert0_i64_zero:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, v0.s[2]
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[2]
 ; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %e = extractelement <4 x i32> %x, i32 2
@@ -94,8 +91,7 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; CHECK-LABEL: extract3_i32_zext_insert0_i64_zero:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[3]
 ; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %e = extractelement <4 x i32> %x, i32 3
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index ebd027b979e737..e63d6249991fe8 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -69,9 +69,9 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract1_i32_zext_insert0_i64_zero:
@@ -114,9 +114,9 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero:
@@ -375,8 +375,7 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 ; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
@@ -417,14 +416,14 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 ; SSE-LABEL: extract1_i16_zext_insert0_i64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract1_i16_zext_insert0_i64_zero:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 1
   %z = zext i16 %e to i64
@@ -453,14 +452,14 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 ; SSE-LABEL: extract2_i16_zext_insert0_i64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i16_zext_insert0_i64_zero:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 2
   %z = zext i16 %e to i64
@@ -487,14 +486,14 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) {
 ; SSE-LABEL: extract3_i16_zext_insert0_i64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract3_i16_zext_insert0_i64_zero:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 3
   %z = zext i16 %e to i64
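For reference, a minimal LLVM IR sketch of the pattern this combine targets. It is not part of the patch; it simply restates the shape exercised by the extract0_i32_zext_insert0_i64_zero test above (the function name here is illustrative only). The zext feeding lane 0 of an otherwise all-zero <2 x i64> becomes, after legalization, a BUILD_VECTOR whose first operand is a ZERO_EXTEND and whose other operand is constant zero; convertBuildVecZextToBuildVecWithZeros re-expresses it as a BUILD_VECTOR with half-width (i32) elements, {x[0], 0, 0, 0}, followed by a bitcast back to v2i64, which is what the updated CHECK lines reflect.

; Sketch only (mirrors the existing tests in this patch), e.g. for llc -mtriple=aarch64:
define <2 x i64> @zext_lane0_rest_zero_sketch(<4 x i32> %x) {
  ; Take lane 0, zero-extend it to i64, and place it into a zero vector.
  %e = extractelement <4 x i32> %x, i32 0
  %z = zext i32 %e to i64
  %r = insertelement <2 x i64> zeroinitializer, i64 %z, i32 0
  ret <2 x i64> %r
}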