Skip to content

Commit

Permalink
[AArch64] Break up larger shuffle-masks into legal sizes in getShuffl…
Browse files Browse the repository at this point in the history
…eCost

Given a larger-than-legal shuffle mask, the final codegen will split
into multiple sub-vectors. This attempts to model that in
AArch64TTIImpl::getShuffleCost, splitting masks up according to the size
of the legalized vectors. If the sub-masks have at most 2 input sources
we can call getShuffleCost on them and sum the costs, to get a more
accurate final cost for the entire shuffle. The call to
improveShuffleKindFromMask helps to improve the shuffle kind for the
sub-mask cost call.

Differential Revision: https://reviews.llvm.org/D123414
  • Loading branch information
davemgreen committed Apr 27, 2022
1 parent d42f222 commit 8e2a0e6
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 13 deletions.
72 changes: 69 additions & 3 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Expand Up @@ -16,8 +16,8 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
Expand Down Expand Up @@ -2596,15 +2596,81 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
ArrayRef<int> Mask, int Index,
VectorType *SubTp,
ArrayRef<const Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask);
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// If we have a Mask, and the LT is being legalized somehow, split the Mask
// into smaller vectors and sum the cost of each shuffle.
if (!Mask.empty() && isa<FixedVectorType>(Tp) &&
Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
cast<FixedVectorType>(Tp)->getNumElements() >
LT.second.getVectorNumElements() &&
!Index && !SubTp) {
unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
unsigned LTNumElts = LT.second.getVectorNumElements();
unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
VectorType *NTp =
VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
InstructionCost Cost;
for (unsigned N = 0; N < NumVecs; N++) {
SmallVector<int> NMask;
// Split the existing mask into chunks of size LTNumElts. Track the source
// sub-vectors to ensure the result has at most 2 inputs.
unsigned Source1, Source2;
unsigned NumSources = 0;
for (unsigned E = 0; E < LTNumElts; E++) {
int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
: UndefMaskElem;
if (MaskElt < 0) {
NMask.push_back(UndefMaskElem);
continue;
}

// Calculate which source from the input this comes from and whether it
// is new to us.
unsigned Source = MaskElt / LTNumElts;
if (NumSources == 0) {
Source1 = Source;
NumSources = 1;
} else if (NumSources == 1 && Source != Source1) {
Source2 = Source;
NumSources = 2;
} else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
NumSources++;
}

// Add to the new mask. For the NumSources>2 case these are not correct,
// but are only used for the modular lane number.
if (Source == Source1)
NMask.push_back(MaskElt % LTNumElts);
else if (Source == Source2)
NMask.push_back(MaskElt % LTNumElts + LTNumElts);
else
NMask.push_back(MaskElt % LTNumElts);
}
// If the sub-mask has at most 2 input sub-vectors then re-cost it using
// getShuffleCost. If not then cost it using the worst case.
if (NumSources <= 2)
Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
: TTI::SK_PermuteTwoSrc,
NTp, NMask, 0, nullptr, Args);
else if (any_of(enumerate(NMask), [&](const auto &ME) {
return ME.value() % LTNumElts == ME.index();
}))
Cost += LTNumElts - 1;
else
Cost += LTNumElts;
}
return Cost;
}

Kind = improveShuffleKindFromMask(Kind, Mask);

// Check for broadcast loads.
if (Kind == TTI::SK_Broadcast) {
bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
if (IsLoad && LT.second.isVector() &&
isLegalBroadcastLoad(Tp->getElementType(),
LT.second.getVectorElementCount()))
LT.second.getVectorElementCount()))
return 0; // broadcast is handled by ld1r
}

Expand Down
20 changes: 10 additions & 10 deletions llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
Expand Up @@ -101,19 +101,19 @@ define void @insert_subvec() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
Expand Down Expand Up @@ -166,18 +166,18 @@ define void @multipart() {
; CHECK-LABEL: 'multipart'
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16b = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v64d = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64d = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64ab = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
Expand Down

0 comments on commit 8e2a0e6

Please sign in to comment.