
[SLP]Improve bottom-to-top reordering.
Currently the bottom-to-top reordering analysis counts the orders of the
operands and then adds natural-order counts for the operand users. This
is very conservative, since the user nodes themselves may require
reordering. The patch improves the bottom-to-top analysis by checking
whether the user nodes require or allow the reordering. If a user node
must be reordered, has reused scalars, is an alternate-op vectorization
node, is a non-ordered gather node, or may allow reordering because of
its reordered operands, it is considered a node that allows reordering
and is not counted as a node with the natural order.

Differential Revision: https://reviews.llvm.org/D120492
alexey-bataev committed Feb 28, 2022
1 parent a64d3c6 commit e4b9640
Showing 3 changed files with 135 additions and 58 deletions.
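Before the diff, a minimal standalone sketch of the user-node check described in the commit message. It uses a hypothetical, simplified Node type rather than the real LLVM TreeEntry API; the actual implementation is the AllowsReordering lambda added to SLPVectorizer.cpp in the diff below.

#include <vector>

// Hypothetical, simplified stand-in for an SLP graph node; the real code
// operates on BoUpSLP::TreeEntry.
struct Node {
  bool MustReorder = false;          // has non-empty reorder indices
  bool HasReusedScalars = false;     // has non-empty reuse-shuffle indices
  bool IsAltOpVectorization = false; // alternate-opcode vectorize node
  bool IsGather = false;             // non-vectorized (gather) node
  bool GatherHasOrder = false;       // gather node with a known order
  std::vector<const Node *> Operands;
};

// A node's own shape allows reordering if it must be reordered anyway,
// already implies a shuffle, or is a gather without a fixed order.
static bool nodeShapeAllowsReordering(const Node &N) {
  if (N.MustReorder || N.HasReusedScalars || N.IsAltOpVectorization)
    return true;
  if (N.IsGather)
    return !N.GatherHasOrder;
  return false;
}

// A user node is not counted as imposing the natural order if its own shape
// allows reordering, or if strictly more than half of its operand nodes do
// (the real code additionally requires those operands to be used only by this
// user). Like the patch, this looks just one level down to bound compile time.
static bool userAllowsReordering(const Node &User) {
  if (nodeShapeAllowsReordering(User))
    return true;
  unsigned NumAllowing = 0;
  for (const Node *Op : User.Operands)
    if (Op && nodeShapeAllowsReordering(*Op))
      ++NumAllowing;
  return NumAllowing > User.Operands.size() / 2;
}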
171 changes: 126 additions & 45 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1904,6 +1904,41 @@ class BoUpSLP {
~BoUpSLP();

private:
/// Check if the operands on the edges \p Edges of the \p UserTE allow
/// reordering (i.e. the operands can be reordered because they have only one
/// user and are reorderable).
/// \param ReorderableGathers List of all gather nodes that require reordering
/// (e.g., gather of extractelements or partially vectorizable loads).
/// \param GatherOps List of gather operand nodes for \p UserTE that require
/// reordering, subset of \p ReorderableGathers.
bool
canReorderOperands(TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps);

/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
/// if any. If it is not vectorized (gather node), returns nullptr.
TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
TreeEntry *TE = nullptr;
const auto *It = find_if(VL, [this, &TE](Value *V) {
TE = getTreeEntry(V);
return TE;
});
if (It != VL.end() && TE->isSame(VL))
return TE;
return nullptr;
}

/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
/// if any. If it is not vectorized (gather node), returns nullptr.
const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
unsigned OpIdx) const {
return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
const_cast<TreeEntry *>(UserTE), OpIdx);
}

/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const;
@@ -3383,6 +3418,44 @@ void BoUpSLP::reorderTopToBottom() {
}
}

bool BoUpSLP::canReorderOperands(
TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps) {
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
OpData.second->State == TreeEntry::Vectorize;
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
// Do not reorder if operand node is used by many user nodes.
if (any_of(TE->UserTreeIndices,
[UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
return false;
// Add the node to the list of the ordered nodes with the identity
// order.
Edges.emplace_back(I, TE);
continue;
}
ArrayRef<Value *> VL = UserTE->getOperand(I);
TreeEntry *Gather = nullptr;
if (count_if(ReorderableGathers, [VL, &Gather](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
"Only non-vectorized nodes are expected.");
if (TE->isSame(VL)) {
Gather = TE;
return true;
}
return false;
}) > 1)
return false;
if (Gather)
GatherOps.push_back(Gather);
}
return true;
}

void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SetVector<TreeEntry *> OrderedEntries;
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
@@ -3403,42 +3476,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
});

// Checks if the operands of the users are reorderable and have only a single
// use.
auto &&CheckOperands =
[this, &NonVectorized](const auto &Data,
SmallVectorImpl<TreeEntry *> &GatherOps) {
for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
if (any_of(Data.second,
[I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
OpData.second->State == TreeEntry::Vectorize;
}))
continue;
ArrayRef<Value *> VL = Data.first->getOperand(I);
const TreeEntry *TE = nullptr;
const auto *It = find_if(VL, [this, &TE](Value *V) {
TE = getTreeEntry(V);
return TE;
});
if (It != VL.end() && TE->isSame(VL))
return false;
TreeEntry *Gather = nullptr;
if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
"Only non-vectorized nodes are expected.");
if (TE->isSame(VL)) {
Gather = TE;
return true;
}
return false;
}) > 1)
return false;
if (Gather)
GatherOps.push_back(Gather);
}
return true;
};
// 1. Propagate order to the graph nodes, which use only reordered nodes.
// I.e., if the node has operands, that are reordered, try to make at least
// one operand order in the natural order and reorder others + reorder the
@@ -3475,10 +3512,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Erase filtered entries.
for_each(Filtered,
[&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
for (const auto &Data : Users) {
for (auto &Data : Users) {
// Check that operands are used only in the User node.
SmallVector<TreeEntry *> GatherOps;
if (!CheckOperands(Data, GatherOps)) {
if (!canReorderOperands(Data.first, Data.second, NonVectorized,
GatherOps)) {
for_each(Data.second,
[&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
OrderedEntries.remove(Op.second);
@@ -3494,6 +3532,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// the same node may be considered several times, though it might not be
// profitable.
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
for (const auto &Op : Data.second) {
TreeEntry *OpTE = Op.second;
if (!VisitedOps.insert(OpTE).second)
@@ -3506,6 +3545,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return GathersToOrders.find(OpTE)->second;
return OpTE->ReorderIndices;
}();
unsigned NumOps = count_if(
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
});
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3517,14 +3560,52 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
NumOps;
} else {
++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
}
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
const TreeEntry *TE) {
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
(IgnoreReorder && TE->Idx == 0))
return true;
if (TE->State == TreeEntry::NeedToGather) {
auto It = GathersToOrders.find(TE);
if (It != GathersToOrders.end())
return !It->second.empty();
return true;
}
return false;
};
for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
TreeEntry *UserTE = EI.UserTE;
if (!VisitedUsers.insert(UserTE).second)
continue;
// May reorder user node if it requires reordering, has reused
// scalars, is an alternate op vectorize node or its op nodes require
// reordering.
if (AllowsReordering(UserTE))
continue;
// Check if users allow reordering.
// Currently look up just 1 level of operands to avoid increase of
// the compile time.
// Profitable to reorder if definitely more operands allow
// reordering rather than those with natural order.
ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
if (static_cast<unsigned>(count_if(
Ops, [UserTE, &AllowsReordering](
const std::pair<unsigned, TreeEntry *> &Op) {
return AllowsReordering(Op.second) &&
all_of(Op.second->UserTreeIndices,
[UserTE](const EdgeInfo &EI) {
return EI.UserTE == UserTE;
});
})) <= Ops.size() / 2)
++Res.first->second;
}
OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
OpTE->UserTreeIndices.size();
assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
--OrdersUses[{}];
}
// If no orders - skip current nodes and jump to the next one, if any.
if (OrdersUses.empty()) {
@@ -3565,7 +3646,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
OrderedEntries.remove(TE);
if (!VisitedOps.insert(TE).second)
continue;
if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
// Just reorder reuses indices.
reorderReuses(TE->ReuseShuffleIndices, Mask);
continue;
13 changes: 5 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -18,19 +18,16 @@ define void @test(i32* %0, i32* %1, i32* %2) {
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 3
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
; CHECK-NEXT: [[TMP23:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP20]]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
; CHECK-NEXT: [[TMP24:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
; CHECK-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[SHUFFLE2]]
; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i32> [[TMP25]], <i32 1, i32 0, i32 0, i32 0>
; CHECK-NEXT: [[TMP27:%.*]] = sub <4 x i32> [[TMP25]], <i32 1, i32 0, i32 0, i32 0>
; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT: [[TMP24:%.*]] = sub <4 x i32> [[TMP23]], [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[TMP18]]
; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i32> [[TMP25]], <i32 0, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[TMP27:%.*]] = sub <4 x i32> [[TMP25]], <i32 0, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
; CHECK-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], zeroinitializer
; CHECK-NEXT: [[TMP30:%.*]] = sub <4 x i32> [[TMP28]], zeroinitializer
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i32> [[TMP29]], <4 x i32> [[TMP30]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -7,21 +7,20 @@ define i32 @test(i32* %isec) {
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[ISEC:%.*]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX10]] to <2 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1
; CHECK-NEXT: br i1 false, label [[BLOCK1:%.*]], label [[BLOCK3:%.*]]
; CHECK: block1:
; CHECK-NEXT: br i1 false, label [[BLOCK2:%.*]], label [[BLOCK3]]
; CHECK: block2:
; CHECK-NEXT: br label [[BLOCK3]]
; CHECK: block3:
; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[SHUFFLE]], [[BLOCK1]] ], [ [[SHUFFLE]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP7]]
; CHECK-NEXT: ret i32 [[TMP9]]
;
entry:
