Skip to content

Commit

Permalink
[DAG] BUILD_VECTOR: absorb ZERO_EXTEND of a single first operand if all other ops are zeros
Browse files Browse the repository at this point in the history

This kind of pattern seems to come up as regressions
with better ZERO_EXTEND_VECTOR_INREG recognition.

For initial implementation, this is quite restricted
to the minimal viable transform, otherwise there are
too many regressions to be dealt with.
  • Loading branch information
LebedevRI committed Dec 30, 2022
1 parent 6bb4b2d commit e4d25a9
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 28 deletions.
115 changes: 115 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -612,6 +612,7 @@ namespace {
SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
SDValue convertBuildVecZextToZext(SDNode *N);
SDValue convertBuildVecZextToBuildVecWithZeros(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecTruncToBitCast(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
Expand Down Expand Up @@ -21451,6 +21452,117 @@ SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
VT, In);
}

// If this is a very simple BUILD_VECTOR with first element being a ZERO_EXTEND,
// and all other elements being constant zero's, granularize the BUILD_VECTOR's
// element width, absorbing the ZERO_EXTEND, turning it into a constant zero op.
// This pattern can appear during legalization.
//
// For example, (v2i64 build_vector (i64 zext (i32 x)), (i64 0)) becomes
// (v2i64 bitcast (v4i32 build_vector x, 0, 0, 0)).
//
// NOTE: This can be generalized to allow more than a single
// non-constant-zero op, UNDEF's, and to be KnownBits-based.
SDValue DAGCombiner::convertBuildVecZextToBuildVecWithZeros(SDNode *N) {
  // Don't run this after legalization. Targets may have other preferences.
  if (Level >= AfterLegalizeDAG)
    return SDValue();

  // FIXME: support big-endian.
  // (The chunk/zero-padding layout below assumes little-endian ordering.)
  if (DAG.getDataLayout().isBigEndian())
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT OpVT = N->getOperand(0).getValueType();
  assert(!VT.isScalableVector() && "Encountered scalable BUILD_VECTOR?");

  unsigned EltBitwidth = VT.getScalarSizeInBits();
  // NOTE: the actual width of operands may be wider than that!

  // Analyze all operands of this BUILD_VECTOR. What is the largest number of
  // active bits they all have? We'll want to truncate them all to that width.
  unsigned ActiveBits = 0;
  // Bitmask of operands known to be constant zero, indexed by operand number.
  APInt KnownZeroOps(VT.getVectorNumElements(), 0);
  for (auto I : enumerate(N->ops())) {
    SDValue Op = I.value();
    // FIXME: support UNDEF elements?
    if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
      // Only look at the low EltBitwidth bits; the constant's type may be
      // wider than the vector element type (see NOTE above).
      unsigned OpActiveBits =
          Cst->getAPIntValue().trunc(EltBitwidth).getActiveBits();
      if (OpActiveBits == 0) {
        KnownZeroOps.setBit(I.index());
        continue;
      }
      // Profitability check: don't allow non-zero constant operands.
      return SDValue();
    }
    // Profitability check: there must only be a single non-zero operand,
    // and it must be the first operand of the BUILD_VECTOR.
    if (I.index() != 0)
      return SDValue();
    // The operand must be a zero-extension itself.
    // FIXME: this could be generalized to known leading zeros check.
    if (Op.getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    // Width of the pre-extension value — the only potentially-non-zero bits
    // of this element.
    unsigned CurrActiveBits =
        Op.getOperand(0).getValueSizeInBits().getFixedSize();
    assert(!ActiveBits && "Already encountered non-constant-zero operand?");
    ActiveBits = CurrActiveBits;
    // We want to at least halve the element size.
    if (2 * ActiveBits > EltBitwidth)
      return SDValue();
  }

  // This BUILD_VECTOR must have at least one non-constant-zero operand.
  if (ActiveBits == 0)
    return SDValue();

  // We have EltBitwidth bits, the *minimal* chunk size is ActiveBits,
  // into how many chunks can we split our element width?
  unsigned Factor = divideCeil(EltBitwidth, ActiveBits);
  assert(Factor > 1 && "Did not split the element after all?");
  assert(EltBitwidth % Factor == 0 && "Can not split into this many chunks?");
  unsigned ChunkBitwidth = EltBitwidth / Factor;
  assert(ChunkBitwidth >= ActiveBits && "Underestimated chunk size?");
  assert(ChunkBitwidth < EltBitwidth && "Failed to reduce element width?");

  // Integer type of the same width as the (possibly wider-than-element)
  // source operands; the non-zero operand is bitcast to this before being
  // truncated down to one chunk.
  EVT OpIntVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
  EVT NewScalarIntVT = EVT::getIntegerVT(*DAG.getContext(), ChunkBitwidth);
  EVT NewIntVT = EVT::getVectorVT(*DAG.getContext(), NewScalarIntVT,
                                  Factor * N->getNumOperands());

  // Never create illegal types.
  if (!TLI.isTypeLegal(OpIntVT) || !TLI.isTypeLegal(NewScalarIntVT) ||
      !TLI.isTypeLegal(NewIntVT))
    return SDValue();

  // After operation legalization, also require that every node we are about
  // to create is legal or custom-lowered for the target.
  if (LegalOperations &&
      !(TLI.isOperationLegalOrCustom(ISD::BITCAST, OpIntVT) &&
        TLI.isOperationLegalOrCustom(ISD::TRUNCATE, NewScalarIntVT) &&
        TLI.isOperationLegalOrCustom(ISD::BUILD_VECTOR, NewIntVT)))
    return SDValue();

  SDLoc DL(N);
  // Constant zero chunk, used both for the known-zero source elements and to
  // pad the high chunks of the one non-zero element (little-endian layout).
  SDValue ZeroOp = DAG.getConstant(0, DL, NewScalarIntVT);

  // Recreate the BUILD_VECTOR, with elements now being Factor times smaller.
  SmallVector<SDValue, 16> NewOps;
  NewOps.reserve(NewIntVT.getVectorNumElements());
  for (auto I : enumerate(N->ops())) {
    SDValue Op = I.value();
    // FIXME: after allowing UNDEF's, do handle them here.
    unsigned SrcOpIdx = I.index();
    if (KnownZeroOps[SrcOpIdx]) {
      // A constant-zero source element expands to Factor zero chunks.
      NewOps.append(Factor, ZeroOp);
      continue;
    }
    // The single non-zero operand: drop the zext by bitcasting the operand to
    // an integer and truncating it to one chunk (which holds all its active
    // bits); the remaining Factor-1 high chunks are known zero.
    Op = DAG.getBitcast(OpIntVT, Op);
    Op = DAG.getNode(ISD::TRUNCATE, DL, NewScalarIntVT, Op);
    NewOps.emplace_back(Op);
    NewOps.append(Factor - 1, ZeroOp);
  }
  assert(NewOps.size() == NewIntVT.getVectorNumElements());
  SDValue NewBV = DAG.getBuildVector(NewIntVT, DL, NewOps);
  // Bitcast the narrow-element vector back to the original vector type.
  NewBV = DAG.getBitcast(VT, NewBV);
  return NewBV;
}

SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);

Expand Down Expand Up @@ -21516,6 +21628,9 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (SDValue V = convertBuildVecZextToZext(N))
return V;

if (SDValue V = convertBuildVecZextToBuildVecWithZeros(N))
return V;

if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;

Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/AArch64/build-vector-extract.ll
Expand Up @@ -16,8 +16,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract0_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: mov v1.d[0], x8
; CHECK-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 0
Expand All @@ -42,8 +41,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: mov v1.d[0], x8
; CHECK-NEXT: mov v1.s[0], v0.s[1]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 1
Expand All @@ -68,8 +66,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract2_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w8, v0.s[2]
; CHECK-NEXT: mov v1.d[0], x8
; CHECK-NEXT: mov v1.s[0], v0.s[2]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 2
Expand All @@ -94,8 +91,7 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract3_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w8, v0.s[3]
; CHECK-NEXT: mov v1.d[0], x8
; CHECK-NEXT: mov v1.s[0], v0.s[3]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 3
Expand Down
39 changes: 19 additions & 20 deletions llvm/test/CodeGen/X86/buildvec-extract.ll
Expand Up @@ -69,9 +69,9 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract1_i32_zext_insert0_i64_zero:
Expand Down Expand Up @@ -114,9 +114,9 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero:
Expand Down Expand Up @@ -375,8 +375,7 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
Expand Down Expand Up @@ -417,14 +416,14 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract1_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $1, %xmm0, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract1_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 1
%z = zext i16 %e to i64
Expand Down Expand Up @@ -453,14 +452,14 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract2_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $2, %xmm0, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract2_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $2, %xmm0, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 2
%z = zext i16 %e to i64
Expand All @@ -487,14 +486,14 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract3_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract3_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 3
%z = zext i16 %e to i64
Expand Down

0 comments on commit e4d25a9

Please sign in to comment.