-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Improve minbitwidth analysis. #78976
[SLP]Improve minbitwidth analysis. #78976
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: This improves overall analysis for minbitwidth in SLP. It allows us to … Metric: size..text Program size..text
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar Patch is 63.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78976.diff 13 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 482970bbf306120..5a493aed0fed7b8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2273,9 +2273,11 @@ class BoUpSLP {
/// constant and to be demoted. Required to correctly identify constant nodes
/// to be demoted.
bool collectValuesToDemote(
- Value *V, SmallVectorImpl<Value *> &ToDemote,
+ Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+ SmallVectorImpl<Value *> &ToDemote,
DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
- SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
+ DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote) const;
/// Check if the operands on the edges \p Edges of the \p UserTE allows
/// reordering (i.e. the operands can be reordered because they have only one
@@ -7862,7 +7864,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
unsigned VecOpcode;
- auto *SrcVecTy =
+ auto *UserVecTy =
FixedVectorType::get(UserScalarTy, E->getVectorFactor());
if (BWSz > SrcBWSz)
VecOpcode = Instruction::Trunc;
@@ -7870,11 +7872,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VecOpcode =
It->second.second ? Instruction::SExt : Instruction::ZExt;
TTI::CastContextHint CCH = GetCastContextHint(VL0);
- VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
+ VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
CostKind);
- ScalarCost +=
- Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy,
- CCH, CostKind);
+ ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy,
+ ScalarTy, CCH, CostKind);
}
}
}
@@ -8955,7 +8956,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
SmallVector<APInt> DemandedElts;
SmallDenseSet<Value *, 4> UsedInserts;
- DenseSet<Value *> VectorCasts;
+ DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -9025,7 +9026,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
VecId = FirstUsers.size() - 1;
auto It = MinBWs.find(ScalarTE);
- if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) {
+ if (It != MinBWs.end() &&
+ VectorCasts
+ .insert(std::make_pair(ScalarTE, FTy->getElementType()))
+ .second) {
unsigned BWSz = It->second.second;
unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType());
unsigned VecOpcode;
@@ -9082,17 +9086,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
// Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
- auto BWIt = MinBWs.find(VectorizableTree.front().get());
+ const TreeEntry &Root = *VectorizableTree.front().get();
+ auto BWIt = MinBWs.find(&Root);
if (BWIt != MinBWs.end()) {
- Type *DstTy = VectorizableTree.front()->Scalars.front()->getType();
+ Type *DstTy = Root.Scalars.front()->getType();
unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
- unsigned Opcode = Instruction::Trunc;
- if (OriginalSz < BWIt->second.first)
- Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
- Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
- Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
- TTI::CastContextHint::None,
- TTI::TCK_RecipThroughput);
+ if (OriginalSz != BWIt->second.first) {
+ unsigned Opcode = Instruction::Trunc;
+ if (OriginalSz < BWIt->second.first)
+ Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
+ Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
+ Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
+ TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput);
+ }
}
}
@@ -11383,9 +11390,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
VecOpcode = Instruction::BitCast;
} else if (BWSz < SrcBWSz) {
VecOpcode = Instruction::Trunc;
- } else if (It != MinBWs.end()) {
+ } else if (SrcIt != MinBWs.end()) {
assert(BWSz > SrcBWSz && "Invalid cast!");
- VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+ VecOpcode =
+ SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
}
}
Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
@@ -11893,7 +11901,7 @@ Value *BoUpSLP::vectorizeTree(
// basic block. Only one extractelement per block should be emitted.
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
SmallDenseSet<Value *, 4> UsedInserts;
- DenseMap<Value *, Value *> VectorCasts;
+ DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
@@ -12014,7 +12022,9 @@ Value *BoUpSLP::vectorizeTree(
// Need to use original vector, if the root is truncated.
auto BWIt = MinBWs.find(E);
if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
- auto VecIt = VectorCasts.find(Scalar);
+ auto *ScalarTy = FTy->getElementType();
+ auto Key = std::make_pair(Vec, ScalarTy);
+ auto VecIt = VectorCasts.find(Key);
if (VecIt == VectorCasts.end()) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (auto *IVec = dyn_cast<Instruction>(Vec))
@@ -12022,10 +12032,10 @@ Value *BoUpSLP::vectorizeTree(
Vec = Builder.CreateIntCast(
Vec,
FixedVectorType::get(
- cast<VectorType>(VU->getType())->getElementType(),
+ ScalarTy,
cast<FixedVectorType>(Vec->getType())->getNumElements()),
BWIt->second.second);
- VectorCasts.try_emplace(Scalar, Vec);
+ VectorCasts.try_emplace(Key, Vec);
} else {
Vec = VecIt->second;
}
@@ -13095,16 +13105,21 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
bool BoUpSLP::collectValuesToDemote(
- Value *V, SmallVectorImpl<Value *> &ToDemote,
+ Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+ SmallVectorImpl<Value *> &ToDemote,
DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
- SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
+ DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote) const {
// We can always demote constants.
- if (isa<Constant>(V))
+ if (isa<Constant>(V)) {
+ MaxDepthLevel = 1;
return true;
+ }
// If the value is not a vectorized instruction in the expression and not used
// by the insertelement instruction and not used in multiple vector nodes, it
// cannot be demoted.
+ // TODO: improve handling of gathered values and others.
auto *I = dyn_cast<Instruction>(V);
if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) ||
!Visited.insert(I).second || all_of(I->users(), [&](User *U) {
@@ -13112,6 +13127,21 @@ bool BoUpSLP::collectValuesToDemote(
}))
return false;
+ auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
+ if (MultiNodeScalars.contains(V))
+ return false;
+ uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
+ APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
+ return true;
+ auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+ unsigned BitWidth1 = OrigBitWidth - NumSignBits;
+ KnownBits Known = computeKnownBits(V, *DL);
+ if (!Known.isNonNegative())
+ ++BitWidth1;
+ BitWidth = std::max(BitWidth, BitWidth1);
+ return BitWidth > 0 && OrigBitWidth / BitWidth > 1;
+ };
unsigned Start = 0;
unsigned End = I->getNumOperands();
switch (I->getOpcode()) {
@@ -13119,12 +13149,16 @@ bool BoUpSLP::collectValuesToDemote(
// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
- Roots.push_back(I->getOperand(0));
+ MaxDepthLevel = 1;
+ if (IsProfitableToDemoteRoot)
+ IsProfitableToDemote = true;
break;
case Instruction::ZExt:
case Instruction::SExt:
- if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
- return false;
+ MaxDepthLevel = 1;
+ if (isa<InsertElementInst>(I->getOperand(0)))
+ return true;
+ IsProfitableToDemote = true;
break;
// We can demote certain binary operations if we can demote both of their
@@ -13134,23 +13168,32 @@ bool BoUpSLP::collectValuesToDemote(
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor:
- if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
- Visited) ||
- !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
- Visited))
+ case Instruction::Xor: {
+ unsigned Level1, Level2;
+ if (!collectValuesToDemote(I->getOperand(0), IsProfitableToDemoteRoot,
+ BitWidth, ToDemote, DemotedConsts, Visited,
+ Level1, IsProfitableToDemote) ||
+ !collectValuesToDemote(I->getOperand(1), IsProfitableToDemoteRoot,
+ BitWidth, ToDemote, DemotedConsts, Visited,
+ Level2, IsProfitableToDemote))
return false;
+ MaxDepthLevel = std::max(Level1, Level2);
break;
+ }
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
Start = 1;
+ unsigned Level1, Level2;
SelectInst *SI = cast<SelectInst>(I);
- if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
- Roots, Visited) ||
- !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
- Roots, Visited))
+ if (!collectValuesToDemote(SI->getTrueValue(), IsProfitableToDemoteRoot,
+ BitWidth, ToDemote, DemotedConsts, Visited,
+ Level1, IsProfitableToDemote) ||
+ !collectValuesToDemote(SI->getFalseValue(), IsProfitableToDemoteRoot,
+ BitWidth, ToDemote, DemotedConsts, Visited,
+ Level2, IsProfitableToDemote))
return false;
+ MaxDepthLevel = std::max(Level1, Level2);
break;
}
@@ -13159,157 +13202,252 @@ bool BoUpSLP::collectValuesToDemote(
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
- if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
- Visited))
+ if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth,
+ ToDemote, DemotedConsts, Visited,
+ MaxDepthLevel, IsProfitableToDemote))
return false;
break;
}
// Otherwise, conservatively give up.
default:
- return false;
+ if (!IsPotentiallyTruncated(I, BitWidth))
+ return false;
+ MaxDepthLevel = 0;
+ Start = End = 0;
+ break;
}
+ ++MaxDepthLevel;
// Gather demoted constant operands.
for (unsigned Idx : seq<unsigned>(Start, End))
if (isa<Constant>(I->getOperand(Idx)))
DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
// Record the value that we can demote.
ToDemote.push_back(V);
- return true;
+ return IsProfitableToDemote;
}
void BoUpSLP::computeMinimumValueSizes() {
// We only attempt to truncate integer expressions.
- auto &TreeRoot = VectorizableTree[0]->Scalars;
- auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
- if (!TreeRootIT)
- return;
+ bool IsStoreOrInsertElt =
+ VectorizableTree.front()->getOpcode() == Instruction::Store ||
+ VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
+ unsigned NodeIdx = 0;
+ if (IsStoreOrInsertElt &&
+ VectorizableTree.front()->State != TreeEntry::NeedToGather)
+ NodeIdx = 1;
// Ensure the roots of the vectorizable tree don't form a cycle.
- if (!VectorizableTree.front()->UserTreeIndices.empty())
+ if ((NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
+ (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
+ [&](const EdgeInfo &EI) {
+ return EI.UserTE->Idx >
+ static_cast<int>(NodeIdx);
+ })))
return;
- // Conservatively determine if we can actually truncate the roots of the
- // expression. Collect the values that can be demoted in ToDemote and
- // additional roots that require investigating in Roots.
- SmallVector<Value *, 32> ToDemote;
- DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
- SmallVector<Value *, 4> Roots;
- for (auto *Root : TreeRoot) {
- DenseSet<Value *> Visited;
- if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
- return;
+ // The first value node for store/insertelement is sext/zext/trunc? Skip it,
+ // resize to the final type.
+ bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
+ if (NodeIdx != 0 &&
+ VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
+ (VectorizableTree[NodeIdx]->getOpcode() == Instruction::ZExt ||
+ VectorizableTree[NodeIdx]->getOpcode() == Instruction::SExt ||
+ VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc)) {
+ assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
+ ++NodeIdx;
+ IsProfitableToDemoteRoot = true;
}
- // The maximum bit width required to represent all the values that can be
- // demoted without loss of precision. It would be safe to truncate the roots
- // of the expression to this width.
- auto MaxBitWidth = 1u;
-
- // We first check if all the bits of the roots are demanded. If they're not,
- // we can truncate the roots to this narrower type.
- for (auto *Root : TreeRoot) {
- auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
- MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
- MaxBitWidth);
- }
-
- // True if the roots can be zero-extended back to their original type, rather
- // than sign-extended. We know that if the leading bits are not demanded, we
- // can safely zero-extend. So we initialize IsKnownPositive to True.
- bool IsKnownPositive = true;
-
- // If all the bits of the roots are demanded, we can try a little harder to
- // compute a narrower type. This can happen, for example, if the roots are
- // getelementptr indices. InstCombine promotes these indices to the pointer
- // width. Thus, all their bits are technically demanded even though the
- // address computation might be vectorized in a smaller type.
- //
- // We start by looking at each entry that can be demoted. We compute the
- // maximum bit width required to store the scalar by using ValueTracking to
- // compute the number of high-order bits we can truncate.
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
- all_of(TreeRoot, [](Value *V) {
- return all_of(V->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); });
- })) {
- MaxBitWidth = 8u;
-
+ SmallVector<Value *> ToDemote;
+ DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
+ auto ComputeMaxBitWidth = [&](ArrayRef<Value *> TreeRoot, unsigned VF,
+ bool IsTopRoot, bool IsProfitableToDemoteRoot,
+ unsigned Opcode, unsigned Limit) {
+ ToDemote.clear();
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT || !Opcode)
+ return 0u;
+
+ unsigned NumParts = TTI->getNumberOfParts(
+ FixedVectorType::get(TreeRoot.front()->getType(), VF));
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 1u;
+
+ // True if the roots can be zero-extended back to their original type,
+ // rather than sign-extended. We know that if the leading bits are not
+ // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
+ // True.
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
- IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
+ bool IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
KnownBits Known = computeKnownBits(R, *DL);
return Known.isNonNegative();
});
- // Determine the maximum number of bits required to store the scalar
- // values.
- for (auto *Scalar : ToDemote) {
- auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
- auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
- MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
- }
-
- // If we can't prove that the sign bit is zero, we must add one to the
- // maximum bit width to account for the unknown sign bit. This preserves
- // the existing sign bit so we can safely sign-extend the root back to the
- // original type. Otherwise, if we know the sign bit is zero, we will
- // zero-extend the root instead.
- //
- // FIXME: This is somewhat suboptimal, as there will be cases where adding
- // one to the maximum bit width will yield a larger-than-necessary
- // type. In general, we need to add an extra bit only if we can't
- // prove that the upper bit of the original type is equal to the
- // upper bit of the proposed smaller type. If these two bits are the
- // same (either zero or one) we know that sign-extending from the
- // smaller type will result in the same value. Here, since we can't
- // yet prove this, we are just making the proposed smaller type
- // larger to ensure correctness.
- if (!IsKnownPositive)
- ++MaxBitWidth;
- }
-
- // Round MaxBitWidth up to the next power-of-two.
- MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
-
- // If the maximum bit width we compute is less than the with of the roots'
- // type, we can proceed with the narrowing. Otherwise, do nothing.
- if (MaxBitWidth >= TreeRootIT->getBitWidth())
- return;
+ // We first check if all the bits of the roots are demanded. If they'r...
[truncated]
|
Created using spr 1.3.5
Created using spr 1.3.5
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ping!
Created using spr 1.3.5
return true; | ||
auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT); | ||
unsigned BitWidth1 = OrigBitWidth - NumSignBits; | ||
KnownBits Known = computeKnownBits(V, *DL); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Optimization) - You're calling computeKnownBits here after MaskedValueIsZero has already done that above
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do not know how to reuse the result from MaskedValueIsZero for checking that V is potentially negative here. Anyway, replaced by llvm::isKnownNonNegative function
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You should be able to replace MaskedValueIsZero with something like:
KnownBits Known = computeKnownBits(V, *DL);
if (Known.countMinLeadingZeros() >= (OrigBitWidth - NumSignBits))
return true;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You should be able to replace MaskedValueIsZero with something like:
KnownBits Known = computeKnownBits(V, *DL); if (Known.countMinLeadingZeros() >= (OrigBitWidth - NumSignBits)) return true;
Not sure this is better. Prefer to use standard functions rather than reinvent them here. Such kind of "manual inlining" is very hard to maintain.
Created using spr 1.3.5
Created using spr 1.3.5
✅ With the latest revision this PR passed the C/C++ code formatter. |
Ping! |
Ping! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - as far as I can tell - the AArch64/getelementptr.ll code explosion seems unfortunate though - have you confirmed it results in better codegen?
AArch64/getelementptr.ll is not a regression. It uses -slp-threshold=-7, which turns on the vectorization of a previously non-profitable graph with this patch. We would need to set -slp-threshold=-6 (or even -5) to get it back, but in that case the first test would fail (it is unaffected by this patch). I'll split this test into 2 separate tests with different thresholds. |
Seems to break the build, maybe on GCC only?
|
Hmm, I did not touch this code. Probably some kind of side effect. Could you provide a full error report? |
https://github.com/dtcxzyw/llvm-ci/actions/runs/8160865465/job/22308394053
Related commits:
|
Hope fixed in 083d8aa |
It looks like this change causes large compile-time regressions in some cases: http://llvm-compile-time-tracker.com/compare.php?from=1b1aea79194117d8f1729ef9c8f80454aea381fe&to=083d8aa03aca55b88098a91e41e41a8e321a5721&stat=instructions%3Au For example, mode_decision.c from lencod regresses by 10% with Other examples are partQalignmm.c from mafft by 3.5%, shared_sha256.c from ClamAV by 5%. |
I'll revert the patch and will try to improve the compile time |
I see it was already reverted. But prior to that, this change has also triggered a miscompile in xxhash, with llvm/unittests/Support/xxhashTest.cpp failing to compute correct hashes for 6 of its test-cases. (This error showed up in our downstream build, which is building llvm for x86-64, but uses various cpu tuning and other build flags. I don't know if any upstream buildbots observed the bug.)
|
This improves overall analysis for minbitwidth in SLP. It allows us to
analyze the trees with store/insertelement root nodes. Also, instead of
using single minbitwidth, detected from the very first analysis stage,
it tries to detect the best one for each trunc/ext subtree in the graph
and use it for the subtree.
Results in better code and less vector register pressure.
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0%
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16
x i16> instead of <16 x i32>, also zext/trunc are removed. In other
places last vector zext/sext removed and replaced by
extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by
reduce or <4 x i8>
Spec2017/imagick - Removed extra zext from 2 packs of the operations.
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar
zext
Spec2017/blender - the whole bunch of vector zext/sext replaced by
extractelement+scalar zext/sext, some extra code vectorized in smaller
types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.