[SLP]Improve/fix reordering of the gathered graph nodes.
Gathered loads/extractelements/extractvalue instructions should also be
checked for whether they can represent a vector reordering node, and their
order should be taken into account for better graph reordering analysis.
Also, if a gather node has reused scalars, its reuse shuffle indices must be
reordered instead of the scalars themselves.

Differential Revision: https://reviews.llvm.org/D112454
alexey-bataev committed Oct 27, 2021
1 parent 9d7006c commit 64d1617
Showing 5 changed files with 177 additions and 89 deletions.
172 changes: 133 additions & 39 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -766,6 +766,12 @@ class BoUpSLP {
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();

/// Checks if the specified gather tree entry \p TE can be represented as a
/// shuffled vector entry + (possibly) permutation with other gathers. It
/// implements the checks only for possibly ordered scalars (Loads,
/// ExtractElement, ExtractValue), which can be part of the graph.
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);

/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
/// of the same size (vectorization factor). Smaller nodes are considered
@@ -2670,6 +2676,72 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
fixupOrderingIndices(Order);
}

Optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
unsigned NumScalars = TE.Scalars.size();
OrdersType CurrentOrder(NumScalars, NumScalars);
SmallVector<int> Positions;
SmallBitVector UsedPositions(NumScalars);
const TreeEntry *STE = nullptr;
// Try to find all gathered scalars that are vectorized in other vectorized
// nodes. Here we can have only a single vectorized tree node to correctly
// identify the order of the gathered scalars.
for (unsigned I = 0; I < NumScalars; ++I) {
Value *V = TE.Scalars[I];
if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
continue;
if (const auto *LocalSTE = getTreeEntry(V)) {
if (!STE)
STE = LocalSTE;
else if (STE != LocalSTE)
// Take the order only from the single vector node.
return None;
unsigned Lane =
std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
if (Lane >= NumScalars)
return None;
if (CurrentOrder[Lane] != NumScalars) {
if (Lane != I)
continue;
UsedPositions.reset(CurrentOrder[Lane]);
}
// The partial identity (where only some elements of the gather node are
// in the identity order) is good.
CurrentOrder[Lane] = I;
UsedPositions.set(I);
}
}
// Need to keep the order if we have a vector entry and at least 2 scalars or
// the vectorized entry has just 2 scalars.
if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
for (unsigned I = 0; I < NumScalars; ++I)
if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
return false;
return true;
};
if (IsIdentityOrder(CurrentOrder)) {
CurrentOrder.clear();
return CurrentOrder;
}
auto *It = CurrentOrder.begin();
for (unsigned I = 0; I < NumScalars;) {
if (UsedPositions.test(I)) {
++I;
continue;
}
if (*It == NumScalars) {
*It = I;
++I;
}
++It;
}
return CurrentOrder;
}
return None;
}
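For illustration, a minimal standalone sketch (plain C++ on integers, not the LLVM code) of the order-inference idea above: match each gathered scalar against the lanes of a single vectorized entry, record which gather position should land in each lane, and fill the unmatched lanes so the result is a full permutation. It deliberately omits the real function's single-entry check, duplicate handling, and identity shortcut; all names here are illustrative.

#include <algorithm>
#include <cstdio>
#include <vector>

// Returns the inferred order (empty on failure).
std::vector<unsigned> inferGatherOrder(const std::vector<int> &GatherScalars,
                                       const std::vector<int> &VecScalars) {
  unsigned NumScalars = GatherScalars.size();
  std::vector<unsigned> Order(NumScalars, NumScalars); // NumScalars == unset.
  std::vector<bool> Used(NumScalars, false);
  for (unsigned I = 0; I < NumScalars; ++I) {
    auto It = std::find(VecScalars.begin(), VecScalars.end(), GatherScalars[I]);
    if (It == VecScalars.end())
      continue; // This scalar is not part of the vectorized entry.
    unsigned Lane = It - VecScalars.begin();
    if (Lane >= NumScalars)
      return {}; // Lane outside the gather width: give up.
    Order[Lane] = I; // Gather position I should end up in lane Lane.
    Used[I] = true;
  }
  // Fill the remaining lanes with the still-unused positions so the result
  // is a full permutation of 0..NumScalars-1.
  unsigned Next = 0;
  for (unsigned Lane = 0; Lane < NumScalars; ++Lane) {
    if (Order[Lane] != NumScalars)
      continue;
    while (Next < NumScalars && Used[Next])
      ++Next;
    if (Next == NumScalars)
      break; // More unset lanes than leftover positions: stop.
    Order[Lane] = Next;
    Used[Next] = true;
  }
  return Order;
}

int main() {
  // The gather holds {c, a, d, b} (encoded as 3, 1, 4, 2); the vectorized
  // entry holds {a, b, c, d} (encoded as 1, 2, 3, 4).
  std::vector<unsigned> Order = inferGatherOrder({3, 1, 4, 2}, {1, 2, 3, 4});
  for (unsigned Idx : Order)
    std::printf("%u ", Idx); // Prints: 1 3 0 2
  std::printf("\n");
}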

void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
@@ -2689,19 +2761,29 @@ void BoUpSLP::reorderTopToBottom() {
InsertElementInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
} else if (TE->State == TreeEntry::NeedToGather &&
TE->getOpcode() == Instruction::ExtractElement &&
!TE->isAltShuffle() &&
isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
->getVectorOperandType()) &&
allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
// Check that gather of extractelements can be represented as
// just a shuffle of a single vector.
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
if (Reuse || !CurrentOrder.empty()) {
return;
}
if (TE->State == TreeEntry::NeedToGather) {
if (TE->getOpcode() == Instruction::ExtractElement &&
!TE->isAltShuffle() &&
isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
->getVectorOperandType()) &&
allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
// Check that gather of extractelements can be represented as
// just a shuffle of a single vector.
OrdersType CurrentOrder;
bool Reuse =
canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
if (Reuse || !CurrentOrder.empty()) {
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
GathersToOrders.try_emplace(TE.get(), CurrentOrder);
return;
}
}
if (Optional<OrdersType> CurrentOrder =
findReusedOrderedScalars(*TE.get())) {
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
GathersToOrders.try_emplace(TE.get(), CurrentOrder);
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
}
});
@@ -2753,7 +2835,7 @@ void BoUpSLP::reorderTopToBottom() {
// Choose the most used order.
ArrayRef<unsigned> BestOrder = OrdersUses.begin()->first;
unsigned Cnt = OrdersUses.begin()->second;
for (const auto &Pair : llvm::drop_begin(OrdersUses)) {
for (const auto &Pair : drop_begin(OrdersUses)) {
if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
BestOrder = Pair.first;
Cnt = Pair.second;
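A small standalone sketch of the "choose the most used order" step above, with plain containers standing in for the LLVM types (an illustrative assumption, not the pass's own data structures): every candidate order collected from the nodes of one vectorization factor gets a use count, the highest count wins, and on a tie the empty order, meaning identity / no reordering, is preferred.

#include <cstdio>
#include <iterator>
#include <map>
#include <vector>

int main() {
  // Hypothetical use counts collected from the tree nodes of one VF.
  std::map<std::vector<unsigned>, unsigned> OrdersUses = {
      {{}, 2},           // Identity (empty) order seen twice.
      {{1, 0, 3, 2}, 2}, // A non-identity order also seen twice.
      {{2, 3, 0, 1}, 1}};

  std::vector<unsigned> BestOrder = OrdersUses.begin()->first;
  unsigned Cnt = OrdersUses.begin()->second;
  for (auto It = std::next(OrdersUses.begin()); It != OrdersUses.end(); ++It) {
    // Strictly more uses wins; on a tie the identity (empty) order wins.
    if (Cnt < It->second || (Cnt == It->second && It->first.empty())) {
      BestOrder = It->first;
      Cnt = It->second;
    }
  }
  // The identity order wins the tie here: 0 elements, 2 uses.
  std::printf("best order has %zu elements, %u uses\n", BestOrder.size(), Cnt);
}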
@@ -2830,6 +2912,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
&NonVectorized](
const std::unique_ptr<TreeEntry> &TE) {
if (TE->State != TreeEntry::Vectorize)
NonVectorized.push_back(TE.get());
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE->ReuseShuffleIndices.empty())
@@ -2838,28 +2922,37 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
OrderedEntries.insert(TE.get());
} else if (TE->State == TreeEntry::NeedToGather &&
TE->getOpcode() == Instruction::ExtractElement &&
!TE->isAltShuffle() &&
isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
->getVectorOperandType()) &&
allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
// Check that gather of extractelements can be represented as
// just a shuffle of a single vector with a single user only.
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
if ((Reuse || !CurrentOrder.empty()) &&
!any_of(
VectorizableTree, [&TE](const std::unique_ptr<TreeEntry> &Entry) {
return Entry->State == TreeEntry::NeedToGather &&
Entry.get() != TE.get() && Entry->isSame(TE->Scalars);
})) {
return;
}
if (TE->State == TreeEntry::NeedToGather) {
if (TE->getOpcode() == Instruction::ExtractElement &&
!TE->isAltShuffle() &&
isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
->getVectorOperandType()) &&
allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
// Check that gather of extractelements can be represented as
// just a shuffle of a single vector with a single user only.
OrdersType CurrentOrder;
bool Reuse =
canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
if ((Reuse || !CurrentOrder.empty()) &&
!any_of(VectorizableTree,
[&TE](const std::unique_ptr<TreeEntry> &Entry) {
return Entry->State == TreeEntry::NeedToGather &&
Entry.get() != TE.get() &&
Entry->isSame(TE->Scalars);
})) {
OrderedEntries.insert(TE.get());
GathersToOrders.try_emplace(TE.get(), CurrentOrder);
return;
}
}
if (Optional<OrdersType> CurrentOrder =
findReusedOrderedScalars(*TE.get())) {
OrderedEntries.insert(TE.get());
GathersToOrders.try_emplace(TE.get(), CurrentOrder);
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
}
if (TE->State != TreeEntry::Vectorize)
NonVectorized.push_back(TE.get());
});

// Checks if the operands of the users are reorderable and have only single
@@ -2911,7 +3004,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
(TE->State == TreeEntry::NeedToGather &&
TE->getOpcode() == Instruction::ExtractElement)) ||
GathersToOrders.count(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
!all_of(drop_begin(TE->UserTreeIndices),
[TE](const EdgeInfo &EI) {
@@ -2989,7 +3082,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Choose the best order.
ArrayRef<unsigned> BestOrder = OrdersUses.begin()->first;
unsigned Cnt = OrdersUses.begin()->second;
for (const auto &Pair : llvm::drop_begin(OrdersUses)) {
for (const auto &Pair : drop_begin(OrdersUses)) {
if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
BestOrder = Pair.first;
Cnt = Pair.second;
@@ -3032,10 +3125,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// For gathers just need to reorder its scalars.
for (TreeEntry *Gather : GatherOps) {
if (!Gather->ReuseShuffleIndices.empty())
continue;
assert(Gather->ReorderIndices.empty() &&
"Unexpected reordering of gathers.");
if (!Gather->ReuseShuffleIndices.empty()) {
// Just reorder reuses indices.
reorderReuses(Gather->ReuseShuffleIndices, Mask);
continue;
}
reorderScalars(Gather->Scalars, Mask);
OrderedEntries.remove(Gather);
}
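The loop above now reorders a gather that has reused scalars through its ReuseShuffleIndices instead of permuting Scalars directly. Below is a minimal standalone sketch of why composing the order into the reuse mask is enough for the expanded node; the "result element i comes from position Order[i]" convention is an assumption made for this demo and not necessarily the exact mask semantics of the reorderReuses/reorderScalars helpers.

#include <cstdio>
#include <vector>

// Compose a reordering into the reuse mask: NewReuse[i] = Reuse[Order[i]].
std::vector<int> reorderReuseIndices(const std::vector<int> &Reuse,
                                     const std::vector<int> &Order) {
  std::vector<int> NewReuse(Reuse.size());
  for (size_t I = 0; I < Reuse.size(); ++I)
    NewReuse[I] = Reuse[Order[I]];
  return NewReuse;
}

int main() {
  std::vector<char> Unique = {'a', 'b'}; // The node's unique scalars.
  std::vector<int> Reuse = {0, 1, 0, 1}; // Expanded node reads: a b a b.
  std::vector<int> Order = {1, 0, 3, 2}; // Desired expanded order: b a b a.

  // Only the short reuse mask changes; the unique scalars stay untouched.
  std::vector<int> NewReuse = reorderReuseIndices(Reuse, Order);
  for (int Idx : NewReuse)
    std::printf("%c ", Unique[Idx]); // Prints: b a b a
  std::printf("\n");
}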
@@ -7369,9 +7465,7 @@ struct SLPVectorizer : public FunctionPass {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}

bool doInitialization(Module &M) override {
return false;
}
bool doInitialization(Module &M) override { return false; }

bool runOnFunction(Function &F) override {
if (skipFunction(F))
@@ -32,21 +32,19 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {

define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-LABEL: @store_chain_v2i64(
; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8
; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8
; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8
; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>*
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
; CHECK-NEXT: ret void
;
%a.0 = getelementptr i64, i64* %a, i64 0
28 changes: 13 additions & 15 deletions llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -32,21 +32,19 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {

define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-LABEL: @store_chain_v2i64(
; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8
; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8
; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8
; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>*
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
; CHECK-NEXT: ret void
;
%a.0 = getelementptr i64, i64* %a, i64 0
15 changes: 8 additions & 7 deletions llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -69,22 +69,23 @@ define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias noca
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[SHUFFLE]], [[TMP10]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4
; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP12]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
