-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Improve minbitwidth analysis. #84536
[SLP]Improve minbitwidth analysis. #84536
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: This improves overall analysis for minbitwidth in SLP. It allows to analyze the trees with store/insertelement root nodes. Metric: size..text Program size..text
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar Original Pull Request: #84334 The patch has the same functionality (no test changes, no changes in Patch is 71.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/84536.diff 15 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 36dc9094538ae9..a5c34bfbf9b4c3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1085,6 +1085,9 @@ class BoUpSLP {
BS->clear();
}
MinBWs.clear();
+ ReductionBitWidth = 0;
+ CastMaxMinBWSizes.reset();
+ TruncNodes.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
PostponedGathers.clear();
@@ -2287,6 +2290,7 @@ class BoUpSLP {
void clearReductionData() {
AnalyzedReductionsRoots.clear();
AnalyzedReductionVals.clear();
+ AnalyzedMinBWVals.clear();
}
/// Checks if the given value is gathered in one of the nodes.
bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
@@ -2307,9 +2311,11 @@ class BoUpSLP {
/// constant and to be demoted. Required to correctly identify constant nodes
/// to be demoted.
bool collectValuesToDemote(
- Value *V, SmallVectorImpl<Value *> &ToDemote,
+ Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+ SmallVectorImpl<Value *> &ToDemote,
DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
- SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
+ DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote) const;
/// Check if the operands on the edges \p Edges of the \p UserTE allows
/// reordering (i.e. the operands can be reordered because they have only one
@@ -2375,6 +2381,10 @@ class BoUpSLP {
/// \ returns the graph entry for the \p Idx operand of the \p E entry.
const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
+ /// \returns Cast context for the given graph node.
+ TargetTransformInfo::CastContextHint
+ getCastContextHint(const TreeEntry &TE) const;
+
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals,
@@ -2925,11 +2935,18 @@ class BoUpSLP {
}
assert(!BundleMember && "Bundle and VL out of sync");
} else {
- MustGather.insert(VL.begin(), VL.end());
// Build a map for gathered scalars to the nodes where they are used.
+ bool AllConstsOrCasts = true;
for (Value *V : VL)
- if (!isConstant(V))
+ if (!isConstant(V)) {
+ auto *I = dyn_cast<CastInst>(V);
+ AllConstsOrCasts &= I && I->getType()->isIntegerTy();
ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
+ }
+ if (AllConstsOrCasts)
+ CastMaxMinBWSizes =
+ std::make_pair(std::numeric_limits<unsigned>::max(), 1);
+ MustGather.insert(VL.begin(), VL.end());
}
if (UserTreeIdx.UserTE)
@@ -3054,6 +3071,10 @@ class BoUpSLP {
/// Set of hashes for the list of reduction values already being analyzed.
DenseSet<size_t> AnalyzedReductionVals;
+ /// Values, already been analyzed for mininmal bitwidth and found to be
+ /// non-profitable.
+ DenseSet<Value *> AnalyzedMinBWVals;
+
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, it means that this Internal Scalar will be used later,
@@ -3629,6 +3650,18 @@ class BoUpSLP {
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
+
+ /// Final size of the reduced vector, if the current graph represents the
+ /// input for the reduction and it was possible to narrow the size of the
+ /// reduction.
+ unsigned ReductionBitWidth = 0;
+
+ /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
+ /// type sizes, used in the tree.
+ std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
+
+ /// Indices of the vectorized trunc nodes.
+ DenseSet<unsigned> TruncNodes;
};
} // end namespace slpvectorizer
@@ -6539,8 +6572,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
+ auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
+ std::make_pair(std::numeric_limits<unsigned>::min(),
+ std::numeric_limits<unsigned>::max()));
+ if (ShuffleOrOp == Instruction::ZExt ||
+ ShuffleOrOp == Instruction::SExt) {
+ CastMaxMinBWSizes = std::make_pair(
+ std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
+ PrevMaxBW),
+ std::min<unsigned>(
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
+ PrevMinBW));
+ } else if (ShuffleOrOp == Instruction::Trunc) {
+ CastMaxMinBWSizes = std::make_pair(
+ std::max<unsigned>(
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
+ PrevMaxBW),
+ std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
+ PrevMinBW));
+ TruncNodes.insert(VectorizableTree.size());
+ }
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
+
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
TE->setOperandsInOrder();
@@ -8362,6 +8416,22 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
return It->get();
}
+TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
+ if (TE.State == TreeEntry::ScatterVectorize ||
+ TE.State == TreeEntry::StridedVectorize)
+ return TTI::CastContextHint::GatherScatter;
+ if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
+ !TE.isAltShuffle()) {
+ if (TE.ReorderIndices.empty())
+ return TTI::CastContextHint::Normal;
+ SmallVector<int> Mask;
+ inversePermutation(TE.ReorderIndices, Mask);
+ if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+ return TTI::CastContextHint::Reversed;
+ }
+ return TTI::CastContextHint::None;
+}
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -8384,6 +8454,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
auto It = MinBWs.find(E);
+ Type *OrigScalarTy = ScalarTy;
if (It != MinBWs.end()) {
ScalarTy = IntegerType::get(F->getContext(), It->second.first);
VecTy = FixedVectorType::get(ScalarTy, VL.size());
@@ -8441,24 +8512,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
UsedScalars.set(I);
}
auto GetCastContextHint = [&](Value *V) {
- if (const TreeEntry *OpTE = getTreeEntry(V)) {
- if (OpTE->State == TreeEntry::ScatterVectorize ||
- OpTE->State == TreeEntry::StridedVectorize)
- return TTI::CastContextHint::GatherScatter;
- if (OpTE->State == TreeEntry::Vectorize &&
- OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
- if (OpTE->ReorderIndices.empty())
- return TTI::CastContextHint::Normal;
- SmallVector<int> Mask;
- inversePermutation(OpTE->ReorderIndices, Mask);
- if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
- return TTI::CastContextHint::Reversed;
- }
- } else {
- InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
- if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
- return TTI::CastContextHint::GatherScatter;
- }
+ if (const TreeEntry *OpTE = getTreeEntry(V))
+ return getCastContextHint(*OpTE);
+ InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
+ if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
+ return TTI::CastContextHint::GatherScatter;
return TTI::CastContextHint::None;
};
auto GetCostDiff =
@@ -8507,8 +8565,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
TTI::CastContextHint CCH = GetCastContextHint(VL0);
VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
CostKind);
- ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy,
- ScalarTy, CCH, CostKind);
}
}
}
@@ -8525,7 +8581,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
std::tie(ScalarCost, VecCost) = getGEPCosts(
- *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
+ *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
"Calculated GEPs cost for Tree"));
@@ -8572,7 +8628,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
NumElts = ATy->getNumElements();
else
NumElts = AggregateTy->getStructNumElements();
- SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
+ SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
}
if (I->hasOneUse()) {
Instruction *Ext = I->user_back();
@@ -8740,13 +8796,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
}
auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
- // Do not count cost here if minimum bitwidth is in effect and it is just
- // a bitcast (here it is just a noop).
- if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
- return TTI::TCC_Free;
- auto *VI = VL0->getOpcode() == Opcode
- ? cast<Instruction>(UniqueValues[Idx])
- : nullptr;
+ auto *VI = cast<Instruction>(UniqueValues[Idx]);
return TTI->getCastInstrCost(Opcode, VL0->getType(),
VL0->getOperand(0)->getType(),
TTI::getCastContextHint(VI), CostKind, VI);
@@ -8789,7 +8839,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
- return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+ return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
Builder.getInt1Ty(), CurrentPred, CostKind,
VI);
};
@@ -8844,7 +8894,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
TTI::OperandValueInfo Op2Info =
TTI::getOperandInfo(VI->getOperand(OpIdx));
SmallVector<const Value *> Operands(VI->operand_values());
- return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
+ return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
Op1Info, Op2Info, Operands, VI);
};
auto GetVectorCost = [=](InstructionCost CommonCost) {
@@ -8863,9 +8913,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::Load: {
auto GetScalarCost = [&](unsigned Idx) {
auto *VI = cast<LoadInst>(UniqueValues[Idx]);
- return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
- VI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), VI);
+ return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
+ VI->getAlign(), VI->getPointerAddressSpace(),
+ CostKind, TTI::OperandValueInfo(), VI);
};
auto *LI0 = cast<LoadInst>(VL0);
auto GetVectorCost = [&](InstructionCost CommonCost) {
@@ -8908,9 +8958,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto GetScalarCost = [=](unsigned Idx) {
auto *VI = cast<StoreInst>(VL[Idx]);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
- return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
- VI->getPointerAddressSpace(), CostKind,
- OpInfo, VI);
+ return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
+ VI->getAlign(), VI->getPointerAddressSpace(),
+ CostKind, OpInfo, VI);
};
auto *BaseSI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
@@ -9772,6 +9822,44 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
Cost -= InsertCost;
}
+ // Add the cost for reduced value resize (if required).
+ if (ReductionBitWidth != 0) {
+ assert(UserIgnoreList && "Expected reduction tree.");
+ const TreeEntry &E = *VectorizableTree.front().get();
+ auto It = MinBWs.find(&E);
+ if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
+ unsigned SrcSize = It->second.first;
+ unsigned DstSize = ReductionBitWidth;
+ unsigned Opcode = Instruction::Trunc;
+ if (SrcSize < DstSize)
+ Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+ auto *SrcVecTy =
+ FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
+ auto *DstVecTy =
+ FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
+ TTI::CastContextHint CCH = getCastContextHint(E);
+ InstructionCost CastCost;
+ switch (E.getOpcode()) {
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ case Instruction::Trunc: {
+ const TreeEntry *OpTE = getOperandEntry(&E, 0);
+ CCH = getCastContextHint(*OpTE);
+ break;
+ }
+ default:
+ break;
+ }
+ CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
+ TTI::TCK_RecipThroughput);
+ Cost += CastCost;
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
+ << " for final resize for reduction from " << SrcVecTy
+ << " to " << DstVecTy << "\n";
+ dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ }
+ }
+
#ifndef NDEBUG
SmallString<256> Str;
{
@@ -10042,7 +10130,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
continue;
VTE = *It->getSecond().begin();
// Iterate through all vectorized nodes.
- auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
+ auto *MIt = find_if(It->getSecond(), [&](const TreeEntry *MTE) {
return MTE->State == TreeEntry::Vectorize;
});
if (MIt == It->getSecond().end())
@@ -10053,11 +10141,6 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
continue;
- auto It = MinBWs.find(VTE);
- // If vectorize node is demoted - do not match.
- if (It != MinBWs.end() &&
- It->second.first != DL->getTypeSizeInBits(V->getType()))
- continue;
VToTEs.insert(VTE);
}
if (VToTEs.empty())
@@ -10105,6 +10188,57 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
return std::nullopt;
}
+ // Filter out entries with larger bitwidth of elements.
+ Type *ScalarTy = VL.front()->getType();
+ unsigned BitWidth = 0;
+ if (ScalarTy->isIntegerTy()) {
+ // Check if the used TEs supposed to be resized and choose the best
+ // candidates.
+ BitWidth = DL->getTypeStoreSize(ScalarTy);
+ if (TEUseEI.UserTE->getOpcode() != Instruction::Select ||
+ TEUseEI.EdgeIdx != 0) {
+ auto UserIt = MinBWs.find(TEUseEI.UserTE);
+ if (UserIt != MinBWs.end())
+ BitWidth = UserIt->second.second;
+ }
+ // Check if the used TEs supposed to be resized and choose the best
+ // candidates.
+ unsigned NodesBitWidth = 0;
+ auto CheckBitwidth = [&](const TreeEntry &TE) {
+ unsigned TEBitWidth = BitWidth;
+ auto UserIt = MinBWs.find(TEUseEI.UserTE);
+ if (UserIt != MinBWs.end())
+ TEBitWidth = UserIt->second.second;
+ if (BitWidth <= TEBitWidth) {
+ if (NodesBitWidth == 0)
+ NodesBitWidth = TEBitWidth;
+ return NodesBitWidth == TEBitWidth;
+ }
+ return false;
+ };
+ for (auto [Idx, Set] : enumerate(UsedTEs)) {
+ DenseSet<const TreeEntry *> ForRemoval;
+ for (const TreeEntry *TE : Set) {
+ if (!CheckBitwidth(*TE))
+ ForRemoval.insert(TE);
+ }
+ // All elements must be removed - remove the whole container.
+ if (ForRemoval.size() == Set.size()) {
+ Set.clear();
+ continue;
+ }
+ for (const TreeEntry *TE : ForRemoval)
+ Set.erase(TE);
+ }
+ for (auto *It = UsedTEs.begin(); It != UsedTEs.end();) {
+ if (It->empty()) {
+ UsedTEs.erase(It);
+ continue;
+ }
+ std::advance(It, 1);
+ }
+ }
+
unsigned VF = 0;
if (UsedTEs.size() == 1) {
// Keep the order to avoid non-determinism.
@@ -12929,7 +13063,21 @@ Value *BoUpSLP::vectorizeTree(
Builder.ClearInsertionPoint();
InstrElementSize.clear();
- return VectorizableTree[0]->VectorizedValue;
+ const TreeEntry &RootTE = *VectorizableTree.front().get();
+ Value *Vec = RootTE.VectorizedValue;
+ if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
+ It != MinBWs.end() &&
+ ReductionBitWidth != It->second.first) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(ReductionRoot->getParent(),
+ ReductionRoot->getIterator());
+ Vec = Builder.CreateIntCast(
+ Vec,
+ VectorType::get(Builder.getIntNTy(ReductionBitWidth),
+ cast<VectorType>(Vec->getType())->getElementCount()),
+ It->second.second);
+ }
+ return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
@@ -13749,23 +13897,48 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
bool BoUpSLP::collectValuesToDemote(
- Value *V, SmallVectorImpl<Value *> &ToDemote,
+ Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+ SmallVectorImpl<Value *> &ToDemote,
DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
- SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
+ DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote) const {
// We can always demote constants.
- if (isa<Constant>(V))
+ if (isa<Constant>(V)) {
+ MaxDepthLevel = 1;
return true;
+ }
+
+ if (DL->getTypeSizeInBits(V->getType()) == BitWidth) {
+ MaxDepthLevel = 1;
+ return true;
+ }
// If the value is not a vectorized instruction in the expression and not used
// by the insertelement instruction and not used in multiple vector nodes, it
// cannot be demoted.
+ // TODO: improve handling of gathered values and others.
auto *I = dyn_cast<Instruction>(V);
- if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) ||
- !Visited.insert(I).second || all_of(I->users(), [&](User *U) {
+ const TreeEntry *ITE = I ? getTreeEntry(I) : nullptr;
+ if (!ITE || !Visited.insert(I).second || MultiNodeScalars.contains(I) ||
+ all_of(I->users(), [&](User *U) {
return isa<InsertElementInst>(U) && !getTreeEntry(U);
}))
return false;
+ auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
+ if (MultiNodeScalars.contain...
[truncated]
|
|
This improves overall analysis for minbitwidth in SLP. It allows to analyze the trees with store/insertelement root nodes. Also, instead of using single minbitwidth, detected from the very first analysis stage, it tries to detect the best one for each trunc/ext subtree in the graph and use it for the subtree. Results in better code and less vector register pressure. Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0% test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0% SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar instructions remain scalar (good). Spec2017/x264 - the whole function idct4x4dc is vectorized using <16 x i16> instead of <16 x i32>, also zext/trunc are removed. In other places last vector zext/sext removed and replaced by extractelement + scalar zext/sext pair. MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by reduce or <4 x i8> Spec2017/imagick - Removed extra zext from 2 packs of the operations. 
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar zext Spec2017/blender - the whole bunch of vector zext/sext replaced by extractelement+scalar zext/sext, some extra code vectorized in smaller types. Spec2006/gobmk - fixed cost estimation, some small code remains scalar. Original Pull Request: #84334 The patch has the same functionality (no test changes, no changes in benchmarks) as the original patch, just has some compile time improvements + fixes for xxhash unittest, discovered earlier in the previous version of the patch. Reviewers: Pull Request: #84536
Looks like 7f21678 broke aarch64 bootstrap. https://lab.llvm.org/buildbot/#/builders/176 |
Can you provide a reproducer? Unable to reproduce it myself |
I'm also seeing issues with this; I see failed asserts in some cases, and successful compilation but incorrect results in others. I'll follow up with repro instructions in a moment. |
Thanks, appreciate it. |
I'll revert the patches for now, but please provide reproducers, it will help to investigate the issues. |
I've run into two separate issues: typedef char *a;
int b, c;
void d() {
int colctr = b, colsum, i, j;
a e, f, g;
long h;
for (; c;) {
colsum = *e++ + *f++;
j = e[0] + f[0];
i = colsum;
for (; colctr; colctr--) {
e++;
f++;
j = e[0] + f[0];
*g++ = i = colsum;
colsum = j;
}
h = i + j;
*g = h;
}
} Compiled like this:
This regressed in ea429e1 / #84363, and is a clear and simple assert failure. This PR, and commit 7f21678, caused miscompilations (code doing the wrong thing at runtime), noticeable in ffmpeg. (I have also observed test failures in openh264, but I haven't had to verify if this is caused by this same commit or not, but it seems plausible.) To repro that, do something along these lines:
I've reproduced these failures with i686-mingw, x86_64-mingw, aarch64-mingw and aarch64-linux targets - I presume it's reproducible for x86_64 linux as well but I haven't tried that. |
Thanks, will investigate |
Thanks! If fixing it takes long, I'd appreciate a revert, but otherwise I'm fine waiting a bit for a forward fix if that's possible. |
As I said, I'll revert it; better to commit fixed versions. Will revert ASAP.
…false." This reverts commit ea429e1 to fix issues reported in #84536 (comment).
Think this also is causing a failure when building
|
The patches were reverted, it should be fixed for now |
Thanks, much appreciated!
I think this is the same issue as I had reduced above; building libjpeg (in some form) triggers failed asserts. But the cases that trigger no asserts but produce wrong results at runtime are probably harder to sort out. |
Thanks, I fixed one small bug and currently investigating the last one. |
This improves overall analysis for minbitwidth in SLP. It allows to analyze the trees with store/insertelement root nodes. Also, instead of using single minbitwidth, detected from the very first analysis stage, it tries to detect the best one for each trunc/ext subtree in the graph and use it for the subtree. Results in better code and less vector register pressure. Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0% test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0% SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar instructions remain scalar (good). Spec2017/x264 - the whole function idct4x4dc is vectorized using <16 x i16> instead of <16 x i32>, also zext/trunc are removed. In other places last vector zext/sext removed and replaced by extractelement + scalar zext/sext pair. MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by reduce or <4 x i8> Spec2017/imagick - Removed extra zext from 2 packs of the operations. 
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar zext Spec2017/blender - the whole bunch of vector zext/sext replaced by extractelement+scalar zext/sext, some extra code vectorized in smaller types. Spec2006/gobmk - fixed cost estimation, some small code remains scalar. Original Pull Request: #84334 The patch has the same functionality (no test changes, no changes in benchmarks) as the original patch, just has some compile time improvements + fixes for xxhash unittest, discovered earlier in the previous version of the patch. Reviewers: Pull Request: #84536
The reland 31eaf86 causes clang crash:
|
Need a reproducer |
Most probably I have a fix for it already, but I really need a reproducer |
I'm trying to reduce it. |
To repro: |
Will fix it ASAP |
Should be fixed in 3942bd2 |
Thanks for the quick fix. |
This improves overall analysis for minbitwidth in SLP. It allows to analyze the trees with store/insertelement root nodes. Also, instead of using single minbitwidth, detected from the very first analysis stage, it tries to detect the best one for each trunc/ext subtree in the graph and use it for the subtree. Results in better code and less vector register pressure. Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0% test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0% SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar instructions remain scalar (good). Spec2017/x264 - the whole function idct4x4dc is vectorized using <16 x i16> instead of <16 x i32>, also zext/trunc are removed. In other places last vector zext/sext removed and replaced by extractelement + scalar zext/sext pair. MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by reduce or <4 x i8> Spec2017/imagick - Removed extra zext from 2 packs of the operations. 
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar zext Spec2017/blender - the whole bunch of vector zext/sext replaced by extractelement+scalar zext/sext, some extra code vectorized in smaller types. Spec2006/gobmk - fixed cost estimation, some small code remains scalar. Original Pull Request: llvm#84334 The patch has the same functionality (no test changes, no changes in benchmarks) as the original patch, just has some compile time improvements + fixes for xxhash unittest, discovered earlier in the previous version of the patch. Reviewers: Pull Request: llvm#84536
This improves overall analysis for minbitwidth in SLP. It allows to
analyze the trees with store/insertelement root nodes. Also, instead of
using single minbitwidth, detected from the very first analysis stage,
it tries to detect the best one for each trunc/ext subtree in the graph
and use it for the subtree.
Results in better code and less vector register pressure.
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0%
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16
x i16> instead of <16 x i32>, also zext/trunc are removed. In other
places last vector zext/sext removed and replaced by
extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by
reduce or <4 x i8>
Spec2017/imagick - Removed extra zext from 2 packs of the operations.
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar
zext
Spec2017/blender - the whole bunch of vector zext/sext replaced by
extractelement+scalar zext/sext, some extra code vectorized in smaller
types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.
Original Pull Request: #84334
The patch has the same functionality (no test changes, no changes in
benchmarks) as the original patch, just has some compile time
improvements + fixes for xxhash unittest, discovered earlier in the
previous version of the patch.