diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 36dc9094538ae..8cf59a18381ab 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1085,6 +1085,7 @@ class BoUpSLP { BS->clear(); } MinBWs.clear(); + ReductionBitWidth = 0; InstrElementSize.clear(); UserIgnoreList = nullptr; PostponedGathers.clear(); @@ -2307,9 +2308,11 @@ class BoUpSLP { /// constant and to be demoted. Required to correctly identify constant nodes /// to be demoted. bool collectValuesToDemote( - Value *V, SmallVectorImpl &ToDemote, + Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth, + SmallVectorImpl &ToDemote, DenseMap> &DemotedConsts, - SmallVectorImpl &Roots, DenseSet &Visited) const; + DenseSet &Visited, unsigned &MaxDepthLevel, + bool &IsProfitableToDemote) const; /// Check if the operands on the edges \p Edges of the \p UserTE allows /// reordering (i.e. the operands can be reordered because they have only one @@ -2375,6 +2378,9 @@ class BoUpSLP { /// \ returns the graph entry for the \p Idx operand of the \p E entry. const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; + /// \returns Cast context for the given graph node. + TTI::CastContextHint getCastContextHint(const TreeEntry &TE) const; + /// \returns the cost of the vectorizable entry. InstructionCost getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, @@ -3629,6 +3635,11 @@ class BoUpSLP { /// value must be signed-extended, rather than zero-extended, back to its /// original width. DenseMap> MinBWs; + + /// Final size of the reduced vector, if the current graph represents the + /// input for the reduction and it was possible to narrow the size of the + /// reduction. + unsigned ReductionBitWidth = 0; }; } // end namespace slpvectorizer @@ -8362,6 +8373,22 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, return It->get(); } +TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { + if (TE.State == TreeEntry::ScatterVectorize || + TE.State == TreeEntry::StridedVectorize) + return TTI::CastContextHint::GatherScatter; + if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load && + !TE.isAltShuffle()) { + if (TE.ReorderIndices.empty()) + return TTI::CastContextHint::Normal; + SmallVector Mask; + inversePermutation(TE.ReorderIndices, Mask); + if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) + return TTI::CastContextHint::Reversed; + } + return TTI::CastContextHint::None; +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallPtrSetImpl &CheckedExtracts) { @@ -8384,6 +8411,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. 
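// Illustrative, self-contained sketch (not part of the patch) of the reverse-order
// test that the new getCastContextHint() helper applies to a vectorized load node:
// invert the node's ReorderIndices into a shuffle mask and check whether that mask
// is the strict reversal <N-1, ..., 1, 0>. The names below are stand-ins, not
// LLVM's inversePermutation / ShuffleVectorInst::isReverseMask.
#include <vector>

static std::vector<int> invertOrder(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (unsigned I = 0, E = Order.size(); I < E; ++I)
    Mask[Order[I]] = static_cast<int>(I); // mirrors inversePermutation: Mask[Order[I]] = I
  return Mask;
}

static bool isReverseOrder(const std::vector<unsigned> &Order) {
  std::vector<int> Mask = invertOrder(Order);
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
    if (Mask[I] != static_cast<int>(E - 1 - I))
      return false;
  return true;
}

// isReverseOrder({3, 2, 1, 0}) is true, so such a load node reports
// TTI::CastContextHint::Reversed; an empty order means Normal, and any other
// non-reverse order falls through to CastContextHint::None.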
auto It = MinBWs.find(E); + Type *OrigScalarTy = ScalarTy; if (It != MinBWs.end()) { ScalarTy = IntegerType::get(F->getContext(), It->second.first); VecTy = FixedVectorType::get(ScalarTy, VL.size()); @@ -8441,24 +8469,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, UsedScalars.set(I); } auto GetCastContextHint = [&](Value *V) { - if (const TreeEntry *OpTE = getTreeEntry(V)) { - if (OpTE->State == TreeEntry::ScatterVectorize || - OpTE->State == TreeEntry::StridedVectorize) - return TTI::CastContextHint::GatherScatter; - if (OpTE->State == TreeEntry::Vectorize && - OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) { - if (OpTE->ReorderIndices.empty()) - return TTI::CastContextHint::Normal; - SmallVector Mask; - inversePermutation(OpTE->ReorderIndices, Mask); - if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) - return TTI::CastContextHint::Reversed; - } - } else { - InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); - if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) - return TTI::CastContextHint::GatherScatter; - } + if (const TreeEntry *OpTE = getTreeEntry(V)) + return getCastContextHint(*OpTE); + InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); + if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) + return TTI::CastContextHint::GatherScatter; return TTI::CastContextHint::None; }; auto GetCostDiff = @@ -8507,8 +8522,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, TTI::CastContextHint CCH = GetCastContextHint(VL0); VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH, CostKind); - ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy, - ScalarTy, CCH, CostKind); } } } @@ -8525,7 +8538,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost ScalarCost = 0; InstructionCost VecCost = 0; std::tie(ScalarCost, VecCost) = getGEPCosts( - *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy); + *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy); LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost, "Calculated GEPs cost for Tree")); @@ -8572,7 +8585,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, NumElts = ATy->getNumElements(); else NumElts = AggregateTy->getStructNumElements(); - SrcVecTy = FixedVectorType::get(ScalarTy, NumElts); + SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts); } if (I->hasOneUse()) { Instruction *Ext = I->user_back(); @@ -8740,13 +8753,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, } } auto GetScalarCost = [&](unsigned Idx) -> InstructionCost { - // Do not count cost here if minimum bitwidth is in effect and it is just - // a bitcast (here it is just a noop). - if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) - return TTI::TCC_Free; - auto *VI = VL0->getOpcode() == Opcode - ? cast(UniqueValues[Idx]) - : nullptr; + auto *VI = cast(UniqueValues[Idx]); return TTI->getCastInstrCost(Opcode, VL0->getType(), VL0->getOperand(0)->getType(), TTI::getCastContextHint(VI), CostKind, VI); @@ -8789,7 +8796,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ? 
CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; - return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, + return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred, CostKind, VI); }; @@ -8844,7 +8851,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(VI->getOperand(OpIdx)); SmallVector Operands(VI->operand_values()); - return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind, + return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands, VI); }; auto GetVectorCost = [=](InstructionCost CommonCost) { @@ -8863,9 +8870,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, case Instruction::Load: { auto GetScalarCost = [&](unsigned Idx) { auto *VI = cast(UniqueValues[Idx]); - return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), - VI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), VI); + return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy, + VI->getAlign(), VI->getPointerAddressSpace(), + CostKind, TTI::OperandValueInfo(), VI); }; auto *LI0 = cast(VL0); auto GetVectorCost = [&](InstructionCost CommonCost) { @@ -8908,9 +8915,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, auto GetScalarCost = [=](unsigned Idx) { auto *VI = cast(VL[Idx]); TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand()); - return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(), - VI->getPointerAddressSpace(), CostKind, - OpInfo, VI); + return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy, + VI->getAlign(), VI->getPointerAddressSpace(), + CostKind, OpInfo, VI); }; auto *BaseSI = cast(IsReorder ? VL[E->ReorderIndices.front()] : VL0); @@ -9772,6 +9779,44 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { Cost -= InsertCost; } + // Add the cost for reduced value resize (if required). + if (ReductionBitWidth != 0) { + assert(UserIgnoreList && "Expected reduction tree."); + const TreeEntry &E = *VectorizableTree.front().get(); + auto It = MinBWs.find(&E); + if (It != MinBWs.end() && It->second.first != ReductionBitWidth) { + unsigned SrcSize = It->second.first; + unsigned DstSize = ReductionBitWidth; + unsigned Opcode = Instruction::Trunc; + if (SrcSize < DstSize) + Opcode = It->second.second ? 
Instruction::SExt : Instruction::ZExt; + auto *SrcVecTy = + FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor()); + auto *DstVecTy = + FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor()); + TTI::CastContextHint CCH = getCastContextHint(E); + InstructionCost CastCost; + switch (E.getOpcode()) { + case Instruction::SExt: + case Instruction::ZExt: + case Instruction::Trunc: { + const TreeEntry *OpTE = getOperandEntry(&E, 0); + CCH = getCastContextHint(*OpTE); + break; + } + default: + break; + } + CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, + TTI::TCK_RecipThroughput); + Cost += CastCost; + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost + << " for final resize for reduction from " << SrcVecTy + << " to " << DstVecTy << "\n"; + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + } + } + #ifndef NDEBUG SmallString<256> Str; { @@ -12929,7 +12974,21 @@ Value *BoUpSLP::vectorizeTree( Builder.ClearInsertionPoint(); InstrElementSize.clear(); - return VectorizableTree[0]->VectorizedValue; + const TreeEntry &RootTE = *VectorizableTree.front().get(); + Value *Vec = RootTE.VectorizedValue; + if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 && + It != MinBWs.end() && + ReductionBitWidth != It->second.first) { + IRBuilder<>::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(ReductionRoot->getParent(), + ReductionRoot->getIterator()); + Vec = Builder.CreateIntCast( + Vec, + VectorType::get(Builder.getIntNTy(ReductionBitWidth), + cast(Vec->getType())->getElementCount()), + It->second.second); + } + return Vec; } void BoUpSLP::optimizeGatherSequence() { @@ -13749,16 +13808,21 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // smaller type with a truncation. We collect the values that will be demoted // in ToDemote and additional roots that require investigating in Roots. bool BoUpSLP::collectValuesToDemote( - Value *V, SmallVectorImpl &ToDemote, + Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth, + SmallVectorImpl &ToDemote, DenseMap> &DemotedConsts, - SmallVectorImpl &Roots, DenseSet &Visited) const { + DenseSet &Visited, unsigned &MaxDepthLevel, + bool &IsProfitableToDemote) const { // We can always demote constants. - if (isa(V)) + if (isa(V)) { + MaxDepthLevel = 1; return true; + } // If the value is not a vectorized instruction in the expression and not used // by the insertelement instruction and not used in multiple vector nodes, it // cannot be demoted. + // TODO: improve handling of gathered values and others. 
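// A small sketch of the opcode choice the new getTreeCost() code makes when the
// reduced value must be resized from the demoted width back to ReductionBitWidth
// (CastKind and pickResizeCast are illustrative names only): truncate when the
// demoted width is already at least as wide, otherwise sign- or zero-extend
// according to the signedness recorded in MinBWs.
enum class CastKind { Trunc, SExt, ZExt };

static CastKind pickResizeCast(unsigned SrcBits, unsigned DstBits, bool IsSigned) {
  if (SrcBits >= DstBits)
    return CastKind::Trunc;
  return IsSigned ? CastKind::SExt : CastKind::ZExt;
}

// The cost of this single vector cast (TTI::getCastInstrCost on the source and
// destination vector types) is added to the tree cost, and vectorizeTree() emits
// the matching Builder.CreateIntCast just before the reduction root.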
auto *I = dyn_cast(V); if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) || !Visited.insert(I).second || all_of(I->users(), [&](User *U) { @@ -13766,6 +13830,20 @@ bool BoUpSLP::collectValuesToDemote( })) return false; + auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool { + if (MultiNodeScalars.contains(V)) + return false; + uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType()); + APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL))) + return true; + auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT); + unsigned BitWidth1 = OrigBitWidth - NumSignBits; + if (!isKnownNonNegative(V, SimplifyQuery(*DL))) + ++BitWidth1; + BitWidth = std::max(BitWidth, BitWidth1); + return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); + }; unsigned Start = 0; unsigned End = I->getNumOperands(); switch (I->getOpcode()) { @@ -13773,12 +13851,14 @@ bool BoUpSLP::collectValuesToDemote( // We can always demote truncations and extensions. Since truncations can // seed additional demotion, we save the truncated value. case Instruction::Trunc: - Roots.push_back(I->getOperand(0)); + MaxDepthLevel = 1; + if (IsProfitableToDemoteRoot) + IsProfitableToDemote = true; break; case Instruction::ZExt: case Instruction::SExt: - if (isa(I->getOperand(0))) - return false; + MaxDepthLevel = 1; + IsProfitableToDemote = true; break; // We can demote certain binary operations if we can demote both of their @@ -13788,23 +13868,32 @@ bool BoUpSLP::collectValuesToDemote( case Instruction::Mul: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots, - Visited) || - !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots, - Visited)) + case Instruction::Xor: { + unsigned Level1, Level2; + if (!collectValuesToDemote(I->getOperand(0), IsProfitableToDemoteRoot, + BitWidth, ToDemote, DemotedConsts, Visited, + Level1, IsProfitableToDemote) || + !collectValuesToDemote(I->getOperand(1), IsProfitableToDemoteRoot, + BitWidth, ToDemote, DemotedConsts, Visited, + Level2, IsProfitableToDemote)) return false; + MaxDepthLevel = std::max(Level1, Level2); break; + } // We can demote selects if we can demote their true and false values. case Instruction::Select: { Start = 1; + unsigned Level1, Level2; SelectInst *SI = cast(I); - if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts, - Roots, Visited) || - !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts, - Roots, Visited)) + if (!collectValuesToDemote(SI->getTrueValue(), IsProfitableToDemoteRoot, + BitWidth, ToDemote, DemotedConsts, Visited, + Level1, IsProfitableToDemote) || + !collectValuesToDemote(SI->getFalseValue(), IsProfitableToDemoteRoot, + BitWidth, ToDemote, DemotedConsts, Visited, + Level2, IsProfitableToDemote)) return false; + MaxDepthLevel = std::max(Level1, Level2); break; } @@ -13813,171 +13902,236 @@ bool BoUpSLP::collectValuesToDemote( case Instruction::PHI: { PHINode *PN = cast(I); for (Value *IncValue : PN->incoming_values()) - if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots, - Visited)) + if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth, + ToDemote, DemotedConsts, Visited, + MaxDepthLevel, IsProfitableToDemote)) return false; break; } // Otherwise, conservatively give up. 
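// A hedged, standalone model of the new IsPotentiallyTruncated check; the
// parameters stand in for what MaskedValueIsZero / ComputeNumSignBits /
// isKnownNonNegative return, and none of this is LLVM API. A value may be demoted
// to TargetBits if all higher bits are known zero, or if its sign-bit based width
// estimate still leaves the original width at least twice as large.
#include <algorithm>

static bool potentiallyTruncatable(unsigned OrigBits, unsigned KnownLeadingZeros,
                                   unsigned KnownSignBits, bool KnownNonNegative,
                                   unsigned &TargetBits) {
  if (OrigBits - KnownLeadingZeros <= TargetBits)
    return true; // every bit from TargetBits upward is already known to be zero
  unsigned Needed = OrigBits - KnownSignBits;
  if (!KnownNonNegative)
    ++Needed; // keep one sign bit so sign-extension round-trips the value
  TargetBits = std::max(TargetBits, Needed);
  return TargetBits > 0 && OrigBits >= 2 * TargetBits;
}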
default: - return false; + if (!IsPotentiallyTruncated(I, BitWidth)) + return false; + MaxDepthLevel = 0; + Start = End = 0; + break; } + ++MaxDepthLevel; // Gather demoted constant operands. for (unsigned Idx : seq(Start, End)) if (isa(I->getOperand(Idx))) DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx); // Record the value that we can demote. ToDemote.push_back(V); - return true; + return IsProfitableToDemote; } void BoUpSLP::computeMinimumValueSizes() { // We only attempt to truncate integer expressions. - auto &TreeRoot = VectorizableTree[0]->Scalars; - auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); - if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather) - return; + bool IsStoreOrInsertElt = + VectorizableTree.front()->getOpcode() == Instruction::Store || + VectorizableTree.front()->getOpcode() == Instruction::InsertElement; + unsigned NodeIdx = 0; + if (IsStoreOrInsertElt && + VectorizableTree.front()->State != TreeEntry::NeedToGather) + NodeIdx = 1; // Ensure the roots of the vectorizable tree don't form a cycle. - if (!VectorizableTree.front()->UserTreeIndices.empty()) + if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather || + (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) || + (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices, + [NodeIdx](const EdgeInfo &EI) { + return EI.UserTE->Idx > + static_cast(NodeIdx); + }))) return; - // Conservatively determine if we can actually truncate the roots of the - // expression. Collect the values that can be demoted in ToDemote and - // additional roots that require investigating in Roots. - SmallVector ToDemote; - DenseMap> DemotedConsts; - SmallVector Roots; - for (auto *Root : TreeRoot) { - DenseSet Visited; - if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited)) - return; + // The first value node for store/insertelement is sext/zext/trunc? Skip it, + // resize to the final type. + bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt; + if (NodeIdx != 0 && + VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && + (VectorizableTree[NodeIdx]->getOpcode() == Instruction::ZExt || + VectorizableTree[NodeIdx]->getOpcode() == Instruction::SExt || + VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc)) { + assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph."); + ++NodeIdx; + IsProfitableToDemoteRoot = true; } - // The maximum bit width required to represent all the values that can be - // demoted without loss of precision. It would be safe to truncate the roots - // of the expression to this width. - auto MaxBitWidth = 1u; - - // We first check if all the bits of the roots are demanded. If they're not, - // we can truncate the roots to this narrower type. - for (auto *Root : TreeRoot) { - auto Mask = DB->getDemandedBits(cast(Root)); - MaxBitWidth = std::max(Mask.getBitWidth() - Mask.countl_zero(), - MaxBitWidth); - } - - // True if the roots can be zero-extended back to their original type, rather - // than sign-extended. We know that if the leading bits are not demanded, we - // can safely zero-extend. So we initialize IsKnownPositive to True. - bool IsKnownPositive = true; - - // If all the bits of the roots are demanded, we can try a little harder to - // compute a narrower type. This can happen, for example, if the roots are - // getelementptr indices. InstCombine promotes these indices to the pointer - // width. 
Thus, all their bits are technically demanded even though the - // address computation might be vectorized in a smaller type. - // - // We start by looking at each entry that can be demoted. We compute the - // maximum bit width required to store the scalar by using ValueTracking to - // compute the number of high-order bits we can truncate. - if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - all_of(TreeRoot, [](Value *V) { - return all_of(V->users(), - [](User *U) { return isa(U); }); - })) { - MaxBitWidth = 8u; - + SmallVector ToDemote; + DenseMap> DemotedConsts; + auto ComputeMaxBitWidth = [&](ArrayRef TreeRoot, unsigned VF, + bool IsTopRoot, bool IsProfitableToDemoteRoot, + unsigned Opcode, unsigned Limit) { + ToDemote.clear(); + auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); + if (!TreeRootIT || !Opcode) + return 0u; + + unsigned NumParts = TTI->getNumberOfParts( + FixedVectorType::get(TreeRoot.front()->getType(), VF)); + + // The maximum bit width required to represent all the values that can be + // demoted without loss of precision. It would be safe to truncate the roots + // of the expression to this width. + unsigned MaxBitWidth = 1u; + + // True if the roots can be zero-extended back to their original type, + // rather than sign-extended. We know that if the leading bits are not + // demanded, we can safely zero-extend. So we initialize IsKnownPositive to + // True. // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. - IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { + bool IsKnownPositive = all_of(TreeRoot, [&](Value *R) { KnownBits Known = computeKnownBits(R, *DL); return Known.isNonNegative(); }); - // Determine the maximum number of bits required to store the scalar - // values. - for (auto *Scalar : ToDemote) { - auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT); - auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType()); - MaxBitWidth = std::max(NumTypeBits - NumSignBits, MaxBitWidth); - } - - // If we can't prove that the sign bit is zero, we must add one to the - // maximum bit width to account for the unknown sign bit. This preserves - // the existing sign bit so we can safely sign-extend the root back to the - // original type. Otherwise, if we know the sign bit is zero, we will - // zero-extend the root instead. - // - // FIXME: This is somewhat suboptimal, as there will be cases where adding - // one to the maximum bit width will yield a larger-than-necessary - // type. In general, we need to add an extra bit only if we can't - // prove that the upper bit of the original type is equal to the - // upper bit of the proposed smaller type. If these two bits are the - // same (either zero or one) we know that sign-extending from the - // smaller type will result in the same value. Here, since we can't - // yet prove this, we are just making the proposed smaller type - // larger to ensure correctness. - if (!IsKnownPositive) - ++MaxBitWidth; - } - - // Round MaxBitWidth up to the next power-of-two. - MaxBitWidth = llvm::bit_ceil(MaxBitWidth); - - // If the maximum bit width we compute is less than the with of the roots' - // type, we can proceed with the narrowing. Otherwise, do nothing. - if (MaxBitWidth >= TreeRootIT->getBitWidth()) - return; + // We first check if all the bits of the roots are demanded. If they're not, + // we can truncate the roots to this narrower type. 
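// A worked sketch (hypothetical numbers, illustrative helper name) of the two
// bounds the root loop just below combines for each root value: one derived from
// known sign bits and one from DemandedBits, keeping the smaller of the two per
// root and the maximum across roots.
#include <algorithm>
#include <bit>
#include <cstdint>

static unsigned rootBitWidthBound(unsigned SignBitBound, uint32_t DemandedMask) {
  // Highest demanded bit + 1, mirroring Mask.getBitWidth() - Mask.countl_zero().
  unsigned DemandedBound = 32u - static_cast<unsigned>(std::countl_zero(DemandedMask));
  return std::min(SignBitBound, DemandedBound);
}

// Example: an i32 root that is known non-negative with 25 sign bits
// (SignBitBound = 32 - 25 = 7) and whose users only demand the low byte
// (DemandedMask = 0xFF, DemandedBound = 8) contributes min(7, 8) = 7, which the
// later rounding steps turn into an i8 demotion target.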
+ for (auto *Root : TreeRoot) { + unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT); + TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType()); + unsigned BitWidth1 = NumTypeBits - NumSignBits; + // If we can't prove that the sign bit is zero, we must add one to the + // maximum bit width to account for the unknown sign bit. This preserves + // the existing sign bit so we can safely sign-extend the root back to the + // original type. Otherwise, if we know the sign bit is zero, we will + // zero-extend the root instead. + // + // FIXME: This is somewhat suboptimal, as there will be cases where adding + // one to the maximum bit width will yield a larger-than-necessary + // type. In general, we need to add an extra bit only if we can't + // prove that the upper bit of the original type is equal to the + // upper bit of the proposed smaller type. If these two bits are + // the same (either zero or one) we know that sign-extending from + // the smaller type will result in the same value. Here, since we + // can't yet prove this, we are just making the proposed smaller + // type larger to ensure correctness. + if (!IsKnownPositive) + ++BitWidth1; + + APInt Mask = DB->getDemandedBits(cast(Root)); + unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); + MaxBitWidth = + std::max(std::min(BitWidth1, BitWidth2), MaxBitWidth); + } + + if (MaxBitWidth < 8 && MaxBitWidth > 1) + MaxBitWidth = 8; + + // If the original type is large, but reduced type does not improve the reg + // use - ignore it. + if (NumParts > 1 && + NumParts == + TTI->getNumberOfParts(FixedVectorType::get( + IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) + return 0u; + + bool IsProfitableToDemote = Opcode == Instruction::Trunc || + Opcode == Instruction::SExt || + Opcode == Instruction::ZExt || NumParts > 1; + // Conservatively determine if we can actually truncate the roots of the + // expression. Collect the values that can be demoted in ToDemote and + // additional roots that require investigating in Roots. + for (auto *Root : TreeRoot) { + DenseSet Visited; + unsigned MaxDepthLevel; + bool NeedToDemote = IsProfitableToDemote; + if (!collectValuesToDemote(Root, IsProfitableToDemoteRoot, MaxBitWidth, + ToDemote, DemotedConsts, Visited, + MaxDepthLevel, NeedToDemote) || + (MaxDepthLevel <= Limit && Opcode != Instruction::Trunc && + Opcode != Instruction::SExt && Opcode != Instruction::ZExt)) + return 0u; + } + // Round MaxBitWidth up to the next power-of-two. + MaxBitWidth = bit_ceil(MaxBitWidth); + + return MaxBitWidth; + }; // If we can truncate the root, we must collect additional values that might // be demoted as a result. That is, those seeded by truncations we will // modify. - while (!Roots.empty()) { - DenseSet Visited; - collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots, - Visited); - } - - // Check that all users are marked for demotion. - DenseSet Demoted(ToDemote.begin(), ToDemote.end()); - DenseSet Visited; - for (Value *V: ToDemote) { - const TreeEntry *TE = getTreeEntry(V); - assert(TE && "Expected vectorized scalar."); - if (!Visited.insert(TE).second) - continue; - if (!all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return all_of(EI.UserTE->Scalars, - [&](Value *V) { return Demoted.contains(V); }); - })) - return; - } - // Finally, map the values we can demote to the maximum bit with we computed. 
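// A small mirror (illustrative name, not patch code) of the register-usage gate
// added to ComputeMaxBitWidth: when the original vector type already splits into
// several registers and the narrowed type would split into just as many, demotion
// is abandoned and the lambda returns a width of 0.
static bool narrowingKeepsSameRegisterCount(unsigned OrigParts, unsigned NarrowParts) {
  return OrigParts > 1 && NarrowParts == OrigParts;
}

// Both part counts come from TTI::getNumberOfParts on the fixed vector types for
// the original element type and for the bit_ceil-rounded candidate width.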
- for (auto *Scalar : ToDemote) { - auto *TE = getTreeEntry(Scalar); - assert(TE && "Expected vectorized scalar."); - if (MinBWs.contains(TE)) + // Add reduction ops sizes, if any. + if (UserIgnoreList && + isa(VectorizableTree.front()->Scalars.front()->getType())) { + for (Value *V : *UserIgnoreList) { + auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT); + auto NumTypeBits = DL->getTypeSizeInBits(V->getType()); + unsigned BitWidth1 = NumTypeBits - NumSignBits; + if (!isKnownNonNegative(V, SimplifyQuery(*DL))) + ++BitWidth1; + auto Mask = DB->getDemandedBits(cast(V)); + unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); + ReductionBitWidth = + std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth); + } + if (ReductionBitWidth < 8 && ReductionBitWidth > 1) + ReductionBitWidth = 8; + + ReductionBitWidth = bit_ceil(ReductionBitWidth); + } + bool IsTopRoot = NodeIdx == 0; + while (NodeIdx < VectorizableTree.size()) { + ArrayRef TreeRoot = VectorizableTree[NodeIdx]->Scalars; + unsigned Limit = 2; + if (NodeIdx == 0 && + ReductionBitWidth == DL->getTypeSizeInBits(TreeRoot.front()->getType())) + Limit = 3; + unsigned MaxBitWidth = ComputeMaxBitWidth( + TreeRoot, VectorizableTree[NodeIdx]->getVectorFactor(), IsTopRoot, + IsProfitableToDemoteRoot, VectorizableTree[NodeIdx]->getOpcode(), + Limit); + IsTopRoot = false; + IsProfitableToDemoteRoot = true; + + ++NodeIdx; + for (unsigned E = VectorizableTree.size(); NodeIdx < E; ++NodeIdx) { + if (VectorizableTree[NodeIdx]->State != TreeEntry::NeedToGather && + !VectorizableTree[NodeIdx]->isAltShuffle() && + VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { + ++NodeIdx; + break; + } + } + + // If the maximum bit width we compute is less than the with of the roots' + // type, we can proceed with the narrowing. Otherwise, do nothing. + if (MaxBitWidth == 0 || + MaxBitWidth >= + cast(TreeRoot.front()->getType())->getBitWidth()) continue; - bool IsSigned = any_of(TE->Scalars, [&](Value *R) { - KnownBits Known = computeKnownBits(R, *DL); - return !Known.isNonNegative(); - }); - MinBWs.try_emplace(TE, MaxBitWidth, IsSigned); - const auto *I = cast(Scalar); - auto DCIt = DemotedConsts.find(I); - if (DCIt != DemotedConsts.end()) { - for (unsigned Idx : DCIt->getSecond()) { - // Check that all instructions operands are demoted. - if (all_of(TE->Scalars, [&](Value *V) { - auto SIt = DemotedConsts.find(cast(V)); - return SIt != DemotedConsts.end() && - is_contained(SIt->getSecond(), Idx); - })) { + + // Finally, map the values we can demote to the maximum bit with we + // computed. + for (Value *Scalar : ToDemote) { + TreeEntry *TE = getTreeEntry(Scalar); + assert(TE && "Expected vectorized scalar."); + if (MinBWs.contains(TE)) + continue; + bool IsSigned = any_of(TE->Scalars, [&](Value *R) { + return !isKnownNonNegative(R, SimplifyQuery(*DL)); + }); + MinBWs.try_emplace(TE, MaxBitWidth, IsSigned); + const auto *I = cast(Scalar); + auto DCIt = DemotedConsts.find(I); + if (DCIt != DemotedConsts.end()) { + for (unsigned Idx : DCIt->getSecond()) { + // Check that all instructions operands are demoted. 
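// An illustrative trace (hypothetical helper names) of how ReductionBitWidth is
// accumulated over the reduction operations in UserIgnoreList and then finalized;
// each op contributes the same sign-bit and demanded-bits bounds used for the
// tree roots.
#include <algorithm>
#include <bit>

static unsigned accumulateReductionWidth(unsigned Running, unsigned SignBitBound,
                                         unsigned DemandedBitBound) {
  return std::max(std::min(SignBitBound, DemandedBitBound), Running);
}

static unsigned finalizeReductionWidth(unsigned Width) {
  if (Width < 8 && Width > 1)
    Width = 8; // widths strictly between 1 and 8 are widened to a full byte
  return std::bit_ceil(Width);
}

// For the red_zext_ld_4xi64 tests, summing four zero-extended i8 loads needs at
// most 10 bits (4 * 255 = 1020), so the width finalizes to 16 -- which is why the
// updated checks below show reduce.add.v4i16 followed by 'zext i16 ... to i64'.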
const TreeEntry *CTE = getOperandEntry(TE, Idx); - MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned); + if (all_of(TE->Scalars, + [&](Value *V) { + auto SIt = DemotedConsts.find(cast(V)); + return SIt != DemotedConsts.end() && + is_contained(SIt->getSecond(), Idx); + }) || + all_of(CTE->Scalars, Constant::classof)) + MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned); } } } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll index cef791633655a..5e3fd156666f5 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -17,12 +17,13 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) { ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4 ; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 ; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll index 47485e514ec2f..1cce52060c479 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll index d67fdc1cd6aa0..a7a7f642ced53 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll @@ -28,21 +28,11 @@ entry: define i64 @red_zext_ld_4xi64(ptr %ptr) { ; CHECK-LABEL: @red_zext_ld_4xi64( ; CHECK-NEXT: entry: -; CHECK-NEXT: 
[[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD0]] to i64 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1 -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP]], align 1 -; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]] -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 2 -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_1]], align 1 -; CHECK-NEXT: [[ZEXT_2:%.*]] = zext i8 [[LD2]] to i64 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[ADD_1]], [[ZEXT_2]] -; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 3 -; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[GEP_2]], align 1 -; CHECK-NEXT: [[ZEXT_3:%.*]] = zext i8 [[LD3]] to i64 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[ADD_2]], [[ZEXT_3]] -; CHECK-NEXT: ret i64 [[ADD_3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64 +; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %ld0 = load i8, ptr %ptr diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 000e7a56df377..500f10659f04c 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -802,9 +802,10 @@ define i64 @red_zext_ld_4xi64(ptr %ptr) { ; CHECK-LABEL: @red_zext_ld_4xi64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]]) -; CHECK-NEXT: ret i64 [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64 +; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %ld0 = load i8, ptr %ptr diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll index 4565d4928ba4a..05511f843a68f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll @@ -15,11 +15,12 @@ define { i64, i64 } @patatino(double %arg) { ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16 ; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1 ; CHECK-NEXT: ret { i64, i64 } [[T17]] ; bb: diff 
--git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll index a0af8e36b36c7..5ee8016076538 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-3 < %s | FileCheck %s +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-6 < %s | FileCheck %s define void @t(i64 %v) { ; CHECK-LABEL: define void @t( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll index 6e512fcbb7392..6051638562b59 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll @@ -6,18 +6,17 @@ define void @test(i8 %0) { ; CHECK-SAME: i8 [[TMP0:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> , i8 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i16> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = zext i16 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 1 ; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SHR]] to i8 ; CHECK-NEXT: store i8 [[CONV9]], ptr null, align 1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 2c834616becc0..4acd63078b82e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -6,15 +6,20 @@ define void @test(i64 %d.promoted.i) { ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i64> , i64 [[AND_1_I_1]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> 
[[TMP0]], i64 [[AND_1_I]], i32 9 -; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i1> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 0 -; CHECK-NEXT: store i32 [[TMP6]], ptr null, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32 +; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0 +; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll index 651631de2c35a..a316415dcc6b5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -17,12 +17,15 @@ target triple = "x86_64-unknown-linux-gnu" define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; SSE-LABEL: @PR31243_zext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 -; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 -; SSE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64 -; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]] -; SSE-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64 -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]] +; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 +; SSE-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 +; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]] +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 +; SSE-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]] ; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] @@ -73,12 +76,15 @@ entry: define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; SSE-LABEL: @PR31243_sext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 -; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 -; SSE-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64 -; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]] -; SSE-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64 -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]] +; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 +; SSE-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64 +; 
SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]] +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 +; SSE-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64 +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]] ; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] @@ -89,13 +95,12 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 ; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], -; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0 -; AVX-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 -; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]] -; AVX-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1 -; AVX-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64 +; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]] +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 +; AVX-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64 +; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]] ; AVX-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; AVX-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; AVX-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll index 88f75c37846ef..3cc32c1fc7b28 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll @@ -15,8 +15,8 @@ define i32 @phi3UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP6]] ; entry: @@ -52,8 +52,8 @@ define i32 @phi2UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP6]] ; entry: @@ -89,8 +89,8 @@ define i32 @phi1UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> 
[[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP6]] ; entry: @@ -127,8 +127,8 @@ define i32 @phi1Undef1PoisonInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %ar ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP6]] ; entry: @@ -165,8 +165,8 @@ define i32 @phi1Undef2PoisonInputs(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %a ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP6]] ; entry: @@ -202,8 +202,8 @@ define i32 @phi1Undef1PoisonGapInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP6]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index 78c6d9516a3de..b7237cbb02bb3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,26 +11,26 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[SHUFFLE1]], +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], ; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 ; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 ; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = lshr <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> 
[[TMP5]], <16 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_4_I_I]], i32 5 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_5_I_I]], i32 6 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_6_I_I]], i32 7 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i8> [[TMP13]], -; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr undef, align 1 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], +; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll index 5d22b5a4873be..1d1fcec2a7aeb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll @@ -7,12 +7,10 @@ define i1 @test(i1 %cmp5.not.31) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i1> , i1 [[CMP5_NOT_31]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 0 -; CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 0 +; CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP4]], 0 ; CHECK-NEXT: ret i1 [[CMP_NOT_I_I]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll index c1dd90d0e9a7b..2f6868d8dfd62 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll @@ 
-8,17 +8,18 @@ ; YAML-NEXT: Function: stores ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-3' +; YAML-NEXT: - Cost: '-7' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '6' define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) { ; CHECK-LABEL: @stores( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]] -; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; %load.1 = load i8, ptr %in, align 1 @@ -63,17 +64,18 @@ define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) { ; YAML-NEXT: Function: insertelems ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-5' +; YAML-NEXT: - Cost: '-9' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '6' define <4 x i64> @insertelems(ptr noalias %in, ptr noalias %inn) { ; CHECK-LABEL: @insertelems( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]] -; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP6]] ; %load.1 = load i8, ptr %in, align 1 %gep.1 = getelementptr inbounds i8, ptr %in, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll index 061fbdb45a13b..ff6f0bdd3db8f 100644 --- a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll @@ -10,8 +10,8 @@ define i32 @alt_cmp(i16 %call46) { ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i16> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i16> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i16 ; CHECK-NEXT: [[OP_RDX:%.*]] = or i16 [[TMP6]], 0 ; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[OP_RDX]] to i32 ; CHECK-NEXT: ret i32 [[EXT]]