Skip to content

Commit

Permalink
[SLP]Do not emit extract elements for insertelements users, replace w…
Browse files Browse the repository at this point in the history
…ith shuffles directly.

SLP vectorizer emits extracts for externally used vectorized scalars and
estimates the cost for each such extract. But in many cases these
scalars are input for insertelement instructions, forming buildvector,
and instead of extractelement/insertelement pair we can emit/cost
estimate shuffle(s) cost and generate series of shuffles, which can be
further optimized.

Tested using test-suite (+SPEC2017), the tests passed, SLP was able to
generate/vectorize more instructions in many cases and it allowed to reduce
number of re-vectorization attempts (where we could try to vectorize
buildector insertelements again and again).

Differential Revision: https://reviews.llvm.org/D107966
  • Loading branch information
alexey-bataev committed May 23, 2022
1 parent 3e0be56 commit 2ac5ebe
Show file tree
Hide file tree
Showing 35 changed files with 479 additions and 284 deletions.
308 changes: 308 additions & 0 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -6668,6 +6668,23 @@ static bool isFirstInsertElement(const InsertElementInst *IE1,
llvm_unreachable("Two different buildvectors not expected.");
}

namespace {
/// Returns incoming Value *, if the requested type is Value * too, or a default
/// value, otherwise.
struct ValueSelect {
template <typename U>
static typename std::enable_if<std::is_same<Value *, U>::value, Value *>::type
get(Value *V) {
return V;
}
template <typename U>
static typename std::enable_if<!std::is_same<Value *, U>::value, U>::type
get(Value *) {
return U();
}
};
} // namespace

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
Expand Down Expand Up @@ -6700,6 +6717,9 @@ static T *performExtractsShuffleAction(
else
Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
}
auto *V = ValueSelect::get<T *>(Base);
assert((!V || GetVF(V) == Mask.size()) &&
"Expected base vector of VF number of elements.");
Prev = Action(Mask, {nullptr, Res.first});
} else if (ShuffleMask.size() == 1) {
// Base is undef and only 1 vector is shuffled - perform the action only for
Expand Down Expand Up @@ -8105,6 +8125,17 @@ Value *BoUpSLP::vectorizeTree() {
return vectorizeTree(ExternallyUsedValues);
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
struct ShuffledInsertData {
/// List of insertelements to be replaced by shuffles.
SmallVector<InsertElementInst *> InsertElements;
/// The parent vectors and shuffle mask for the given list of inserts.
MapVector<Value *, SmallVector<int>> ValueMasks;
};
} // namespace

Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
Expand Down Expand Up @@ -8138,6 +8169,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");

SmallVector<ShuffledInsertData> ShuffledInserts;
// Maps vector instruction to original insertelement instruction
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
Expand Down Expand Up @@ -8177,6 +8211,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
assert(isa<FixedVectorType>(Scalar->getType()) &&
isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?");
auto *IE = cast<InsertElementInst>(Scalar);
VectorToInsertElement.try_emplace(Vec, IE);
return Vec;
};
// If User == nullptr, the Scalar is used as extra arg. Generate
Expand Down Expand Up @@ -8205,6 +8241,65 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
}

if (auto *VU = dyn_cast<InsertElementInst>(User)) {
// Skip if the scalar is another vector op or Vec is not an instruction.
if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
Optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
auto *It =
find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
// Checks if 2 insertelements are from the same buildvector.
InsertElementInst *VecInsert = Data.InsertElements.front();
return areTwoInsertFromSameBuildVector(VU, VecInsert);
});
unsigned Idx = *InsertIdx;
if (It == ShuffledInserts.end()) {
(void)ShuffledInserts.emplace_back();
It = std::next(ShuffledInserts.begin(),
ShuffledInserts.size() - 1);
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
Mask.assign(FTy->getNumElements(), UndefMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
if (IEBase != User &&
(!IEBase->hasOneUse() ||
getInsertIndex(IEBase).getValueOr(Idx) == Idx))
break;
// Build the mask for the vectorized insertelement instructions.
if (const TreeEntry *E = getTreeEntry(IEBase)) {
do {
IEBase = cast<InsertElementInst>(Base);
int IEIdx = *getInsertIndex(IEBase);
assert(Mask[Idx] == UndefMaskElem &&
"InsertElementInstruction used already.");
Mask[IEIdx] = IEIdx;
Base = IEBase->getOperand(0);
} while (E == getTreeEntry(Base));
break;
}
Base = cast<InsertElementInst>(Base)->getOperand(0);
// After the vectorization the def-use chain has changed, need
// to look through original insertelement instructions, if they
// get replaced by vector instructions.
auto It = VectorToInsertElement.find(Base);
if (It != VectorToInsertElement.end())
Base = It->second;
}
}
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
Mask.assign(FTy->getNumElements(), UndefMaskElem);
Mask[Idx] = ExternalUse.Lane;
It->InsertElements.push_back(cast<InsertElementInst>(User));
continue;
}
}
}
}

// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
Expand Down Expand Up @@ -8240,6 +8335,219 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}

// Checks if the mask is an identity mask.
auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {
int Limit = Mask.size();
return VecTy->getNumElements() == Mask.size() &&
all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
ShuffleVectorInst::isIdentityMask(Mask);
};
// Tries to combine 2 different masks into single one.
auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {
SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
if (ExtMask[I] == UndefMaskElem)
continue;
NewMask[I] = Mask[ExtMask[I]];
}
Mask.swap(NewMask);
};
// Peek through shuffles, trying to simplify the final shuffle code.
auto &&PeekThroughShuffles =
[&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,
bool CheckForLengthChange = false) {
while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
// Exit if not a fixed vector type or changing size shuffle.
if (!isa<FixedVectorType>(SV->getType()) ||
(CheckForLengthChange && SV->changesLength()))
break;
// Exit if the identity or broadcast mask is found.
if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||
SV->isZeroEltSplat())
break;
bool IsOp1Undef = isUndefVector(SV->getOperand(0));
bool IsOp2Undef = isUndefVector(SV->getOperand(1));
if (!IsOp1Undef && !IsOp2Undef)
break;
SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
SV->getShuffleMask().end());
CombineMasks(ShuffleMask, Mask);
Mask.swap(ShuffleMask);
if (IsOp2Undef)
V = SV->getOperand(0);
else
V = SV->getOperand(1);
}
};
// Smart shuffle instruction emission, walks through shuffles trees and
// tries to find the best matching vector for the actual shuffle
// instruction.
auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,
&CombineMasks](Value *V1, Value *V2,
ArrayRef<int> Mask) -> Value * {
assert(V1 && "Expected at least one vector value.");
if (V2 && !isUndefVector(V2)) {
// Peek through shuffles.
Value *Op1 = V1;
Value *Op2 = V2;
int VF =
cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (Mask[I] < VF)
CombinedMask1[I] = Mask[I];
else
CombinedMask2[I] = Mask[I] - VF;
}
Value *PrevOp1;
Value *PrevOp2;
do {
PrevOp1 = Op1;
PrevOp2 = Op2;
PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);
PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);
// Check if we have 2 resizing shuffles - need to peek through operands
// again.
if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2))
if (SV1->getOperand(0)->getType() ==
SV2->getOperand(0)->getType() &&
SV1->getOperand(0)->getType() != SV1->getType() &&
isUndefVector(SV1->getOperand(1)) &&
isUndefVector(SV2->getOperand(1))) {
Op1 = SV1->getOperand(0);
Op2 = SV2->getOperand(0);
SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
SV1->getShuffleMask().end());
CombineMasks(ShuffleMask1, CombinedMask1);
CombinedMask1.swap(ShuffleMask1);
SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
SV2->getShuffleMask().end());
CombineMasks(ShuffleMask2, CombinedMask2);
CombinedMask2.swap(ShuffleMask2);
}
} while (PrevOp1 != Op1 || PrevOp2 != Op2);
VF = cast<VectorType>(Op1->getType())
->getElementCount()
.getKnownMinValue();
for (int I = 0, E = Mask.size(); I < E; ++I) {
if (CombinedMask2[I] != UndefMaskElem) {
assert(CombinedMask1[I] == UndefMaskElem &&
"Expected undefined mask element");
CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
}
}
Value *Vec = Builder.CreateShuffleVector(
Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
CombinedMask1);
if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherShuffleSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
return Vec;
}
if (isa<PoisonValue>(V1))
return PoisonValue::get(FixedVectorType::get(
cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
Value *Op = V1;
SmallVector<int> CombinedMask(Mask.begin(), Mask.end());
PeekThroughShuffles(Op, CombinedMask);
if (!isa<FixedVectorType>(Op->getType()) ||
!IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {
Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);
if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherShuffleSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
return Vec;
}
return Op;
};

auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) {
unsigned VF = Mask.size();
unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
if (VF != VecVF) {
if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
Vec = CreateShuffle(Vec, nullptr, Mask);
return std::make_pair(Vec, true);
}
SmallVector<int> ResizeMask(VF, UndefMaskElem);
for (unsigned I = 0; I < VF; ++I) {
if (Mask[I] != UndefMaskElem)
ResizeMask[Mask[I]] = Mask[I];
}
Vec = CreateShuffle(Vec, nullptr, ResizeMask);
}

return std::make_pair(Vec, false);
};
// Perform shuffling of the vectorize tree entries for better handling of
// external extracts.
for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
// Find the first and the last instruction in the list of insertelements.
sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
Builder.SetInsertPoint(LastInsert);
auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
Value *NewInst = performExtractsShuffleAction<Value>(
makeMutableArrayRef(Vector.data(), Vector.size()),
FirstInsert->getOperand(0),
[](Value *Vec) {
return cast<VectorType>(Vec->getType())
->getElementCount()
.getKnownMinValue();
},
ResizeToVF,
[FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
ArrayRef<Value *> Vals) {
assert((Vals.size() == 1 || Vals.size() == 2) &&
"Expected exactly 1 or 2 input values.");
if (Vals.size() == 1) {
// Do not create shuffle if the mask is a simple identity
// non-resizing mask.
if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
->getNumElements() ||
!ShuffleVectorInst::isIdentityMask(Mask))
return CreateShuffle(Vals.front(), nullptr, Mask);
return Vals.front();
}
return CreateShuffle(Vals.front() ? Vals.front()
: FirstInsert->getOperand(0),
Vals.back(), Mask);
});
auto It = ShuffledInserts[I].InsertElements.rbegin();
// Rebuild buildvector chain.
InsertElementInst *II = nullptr;
if (It != ShuffledInserts[I].InsertElements.rend())
II = *It;
SmallVector<Instruction *> Inserts;
while (It != ShuffledInserts[I].InsertElements.rend()) {
assert(II && "Must be an insertelement instruction.");
if (*It == II)
++It;
else
Inserts.push_back(cast<Instruction>(II));
II = dyn_cast<InsertElementInst>(II->getOperand(0));
}
for (Instruction *II : reverse(Inserts)) {
II->replaceUsesOfWith(II->getOperand(0), NewInst);
if (auto *NewI = dyn_cast<Instruction>(NewInst))
if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
II->moveAfter(NewI);
NewInst = II;
}
LastInsert->replaceAllUsesWith(NewInst);
for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
IE->replaceUsesOfWith(IE->getOperand(1),
PoisonValue::get(IE->getOperand(1)->getType()));
eraseInstruction(IE);
}
CSEBlocks.insert(LastInsert->getParent());
}

// For each vectorized value:
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
Expand Up @@ -1312,19 +1312,19 @@ define dso_local i32 @full(i8* nocapture noundef readonly %p1, i32 noundef %st1,
; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP56]]
; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 6, i32 2, i32 10, i32 14, i32 5, i32 1, i32 9, i32 13, i32 4, i32 0, i32 8, i32 12>
; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 7, i32 3, i32 11, i32 15, i32 4, i32 0, i32 8, i32 12, i32 5, i32 1, i32 9, i32 13>
; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> poison, <16 x i32> <i32 5, i32 4, i32 6, i32 7, i32 1, i32 0, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP61]]
; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP61]]
; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 1, i32 0, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP64]], [[TMP65]]
; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP65]]
; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30>
; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP69]]
; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP68]], [[TMP69]]
; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> <i32 2, i32 19, i32 0, i32 17, i32 23, i32 6, i32 5, i32 20, i32 27, i32 10, i32 9, i32 24, i32 31, i32 14, i32 13, i32 28>
; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP72]], [[TMP73]]
; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP73]]
; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
Expand Down

0 comments on commit 2ac5ebe

Please sign in to comment.