Skip to content

Commit

Permalink
[SLP] Initial vectorization of non-power-of-2 ops. (#77790)
Browse files Browse the repository at this point in the history
This patch enables vectorization for non-power-of-2 VFs. Initially, only
VFs where adding 1 makes the VF a power-of-2 are considered, i.e. cases
where we can still make relatively effective use of the vectors.

It relies on the existing target cost-models to return accurate costs
for non-power-of-2 vectors. I checked mostly AArch64 and X86, and the
costs there seem reasonable for the cases I checked, although I expect
there will be a need to refine both the cost-models and lowering to make
the most effective use of non-power-of-2 SLP vectorization.

Note that re-ordering and shuffling are not yet implemented for nodes
requiring padding, to keep the initial implementation simpler.

The feature is guarded by a new flag, off by default for now.

PR: #77790
  • Loading branch information
fhahn committed Apr 13, 2024
1 parent df9c00b commit 6d66db3
Show file tree
Hide file tree
Showing 11 changed files with 947 additions and 499 deletions.
83 changes: 71 additions & 12 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));

// Hidden command-line flag, off by default (cl::init(false)), that opts in to
// trying SLP vectorization with a non-power-of-2 number of elements.
static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
Expand Down Expand Up @@ -2829,6 +2833,14 @@ class BoUpSLP {
SmallVectorImpl<Value *> *OpScalars = nullptr,
SmallVectorImpl<Value *> *AltScalars = nullptr) const;

/// Return true if the number of scalars in this node is not a power of 2.
/// Such nodes must not carry reuse-shuffle indices; this is asserted in
/// builds with assertions enabled.
bool isNonPowOf2Vec() const {
  // Power-of-2 sized nodes need no further checking.
  if (isPowerOf2_32(Scalars.size()))
    return false;
  assert(ReuseShuffleIndices.empty() &&
         "Reshuffling not supported with non-power-of-2 vectors yet.");
  return true;
}

#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
Expand Down Expand Up @@ -2994,9 +3006,11 @@ class BoUpSLP {
MustGather.insert(VL.begin(), VL.end());
}

if (UserTreeIdx.UserTE)
if (UserTreeIdx.UserTE) {
Last->UserTreeIndices.push_back(UserTreeIdx);

assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
"Reordering isn't implemented for non-power-of-2 nodes yet");
}
return Last;
}

Expand Down Expand Up @@ -4256,6 +4270,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
if (!Order.empty() && !isPowerOf2_32(VL.size())) {
assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
"supported with VectorizeNonPowerOf2");
return LoadsState::Gather;
}

Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
Expand Down Expand Up @@ -4575,6 +4596,10 @@ static bool areTwoInsertFromSameBuildVector(

std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
if (TE.isNonPowOf2Vec())
return std::nullopt;

// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
Expand Down Expand Up @@ -5145,6 +5170,10 @@ bool BoUpSLP::canReorderOperands(
TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps) {
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
if (UserTE->isNonPowOf2Vec())
return false;

for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
Expand Down Expand Up @@ -5318,6 +5347,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
const auto AllowsReordering = [&](const TreeEntry *TE) {
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
if (TE->isNonPowOf2Vec())
return false;
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
(IgnoreReorder && TE->Idx == 0))
Expand Down Expand Up @@ -5944,6 +5976,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
if (!isPowerOf2_32(VL.size()))
return TreeEntry::NeedToGather;
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
Expand Down Expand Up @@ -6258,6 +6293,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (NumUniqueScalarValues == VL.size()) {
ReuseShuffleIndicies.clear();
} else {
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
"for nodes with padding.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
}
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (NumUniqueScalarValues <= 1 ||
(UniquePositions.size() == 1 && all_of(UniqueValues,
Expand Down Expand Up @@ -7868,7 +7910,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
if (VectorizedLoads.contains(VL[I]))
continue;
GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
GatherCost +=
getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
}
// Exclude potentially vectorized loads from list of gathered
// scalars.
Expand Down Expand Up @@ -10678,6 +10721,9 @@ BoUpSLP::isGatherShuffledEntry(
// No need to check for the topmost gather node.
if (TE == VectorizableTree.front().get())
return {};
// FIXME: Gathering for non-power-of-2 nodes not implemented yet.
if (TE->isNonPowOf2Vec())
return {};
Mask.assign(VL.size(), PoisonMaskElem);
assert(TE->UserTreeIndices.size() == 1 &&
"Expected only single user of the gather node.");
Expand Down Expand Up @@ -14995,8 +15041,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
const unsigned Sz = R.getVectorElementSize(Chain[0]);
unsigned VF = Chain.size();

if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
return false;
if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
// Check if vectorizing with a non-power-of-2 VF should be considered. At
// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
// all vector lanes are used.
if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
return false;
}

LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
<< "\n");
Expand Down Expand Up @@ -15095,14 +15146,22 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
continue;
}

unsigned NonPowerOf2VF = 0;
if (VectorizeNonPowerOf2) {
// First try vectorizing with a non-power-of-2 VF. At the moment, only
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
// lanes are used.
unsigned CandVF = Operands.size();
if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
NonPowerOf2VF = CandVF;
}

unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
SmallVector<unsigned> CandidateVFs(Sz);
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
unsigned Size = MaxVF;
for_each(CandidateVFs, [&](unsigned &VF) {
VF = Size;
Size /= 2;
SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
unsigned Size = MinVF;
for_each(reverse(CandidateVFs), [&](unsigned &VF) {
VF = Size > MaxVF ? NonPowerOf2VF : Size;
Size *= 2;
});
unsigned StartIdx = 0;
for (unsigned Size : CandidateVFs) {
Expand Down
70 changes: 39 additions & 31 deletions llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll
Original file line number Diff line number Diff line change
@@ -1,35 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=POW2-ONLY %s

define void @v15_load_i8_mul_by_constant_store(ptr %src, ptr noalias %dst) {
; CHECK-LABEL: define void @v15_load_i8_mul_by_constant_store(
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
; CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
; CHECK-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
; CHECK-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
; CHECK-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
; CHECK-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
; CHECK-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
; CHECK-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
; CHECK-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
; CHECK-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
; CHECK-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
; CHECK-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
; CHECK-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
; CHECK-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
; CHECK-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
; CHECK-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
; CHECK-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
; CHECK-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
; CHECK-NEXT: ret void
; NON-POW2-LABEL: define void @v15_load_i8_mul_by_constant_store(
; NON-POW2-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <15 x i8>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <15 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
; NON-POW2-NEXT: store <15 x i8> [[TMP1]], ptr [[DST]], align 1
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @v15_load_i8_mul_by_constant_store(
; POW2-ONLY-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
; POW2-ONLY-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
; POW2-ONLY-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
; POW2-ONLY-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
; POW2-ONLY-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
; POW2-ONLY-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
; POW2-ONLY-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
; POW2-ONLY-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
; POW2-ONLY-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
; POW2-ONLY-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
; POW2-ONLY-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
; POW2-ONLY-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
; POW2-ONLY-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
; POW2-ONLY-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
; POW2-ONLY-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
; POW2-ONLY-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
; POW2-ONLY-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
; POW2-ONLY-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
; POW2-ONLY-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
; POW2-ONLY-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds i8, ptr %src, i8 0
Expand Down Expand Up @@ -123,5 +133,3 @@ entry:

ret void
}


Loading

0 comments on commit 6d66db3

Please sign in to comment.