Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 106 additions & 29 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1241,46 +1241,123 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
unsigned RequestedElts =
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
//
// We assume that shuffling at a register granularity can be done for free.
// This is not true for vectors fed into memory instructions, but it is
// effectively true for all other shuffling. The emphasis of the logic here
// is to assist generic transform in cleaning up / canonicalizing those
// shuffles.

// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
unsigned NumSrcElts = SrcVecTy->getNumElements();
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
(Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
Kind == TTI::SK_PermuteSingleSrc))
return 0;
}

unsigned EltsPerReg = 32 / ScalarSize;
if (RequestedElts == 0)
return 0;
switch (Kind) {
case TTI::SK_Broadcast:
// A single v_perm_b32 can be re-used for all destination registers.
return 1;
case TTI::SK_Reverse:
case TTI::SK_PermuteSingleSrc: {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
return 0;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Broadcast just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
return NumPerms + NumPermMasks;
}
// One instruction per register.
if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
return InstructionCost::getInvalid();
case TTI::SK_ExtractSubvector:
if (Index % EltsPerReg == 0)
return 0; // Shuffling at register granularity
if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
return InstructionCost::getInvalid();
case TTI::SK_InsertSubvector: {
// Even aligned accesses are free
if (!(Index % 2))
return 0;
// Insert/extract subvectors only require shifts / extract code to get the
// relevant bits
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
if (!DstVecTy)
return InstructionCost::getInvalid();
unsigned NumDstElts = DstVecTy->getNumElements();
unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
unsigned EndIndex = Index + NumInsertElts;
unsigned BeginSubIdx = Index % EltsPerReg;
unsigned EndSubIdx = EndIndex % EltsPerReg;
unsigned Cost = 0;

if (BeginSubIdx != 0) {
// Need to shift the inserted vector into place. The cost is the number
// of destination registers overlapped by the inserted vector.
Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
}

// If the last register overlap is partial, there may be three source
// registers feeding into it; that takes an extra instruction.
if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
Cost += 1;

return Cost;
}
case TTI::SK_PermuteTwoSrc:
case TTI::SK_Splice:
case TTI::SK_Select: {
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Select just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
return NumPerms + NumPermMasks;
case TTI::SK_Splice: {
auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
if (!DstVecTy)
return InstructionCost::getInvalid();
unsigned NumElts = DstVecTy->getNumElements();
assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
// Determine the sub-region of the result vector that requires
// sub-register shuffles / mixing.
unsigned EltsFromLHS = NumElts - Index;
bool LHSIsAligned = (Index % EltsPerReg) == 0;
bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
if (LHSIsAligned && RHSIsAligned)
return 0;
if (LHSIsAligned && !RHSIsAligned)
return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
if (!LHSIsAligned && RHSIsAligned)
return divideCeil(EltsFromLHS, EltsPerReg);
return divideCeil(NumElts, EltsPerReg);
}

default:
break;
}

if (!Mask.empty()) {
unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();

// Generically estimate the cost by assuming that each destination
// register is derived from sources via v_perm_b32 instructions if it
// can't be copied as-is.
//
// For each destination register, derive the cost of obtaining it based
// on the number of source registers that feed into it.
unsigned Cost = 0;
for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
SmallVector<int, 4> Regs;
bool Aligned = true;
for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
int SrcIdx = Mask[DstIdx + I];
if (SrcIdx == -1)
continue;
int Reg;
if (SrcIdx < (int)NumSrcElts) {
Reg = SrcIdx / EltsPerReg;
if (SrcIdx % EltsPerReg != I)
Aligned = false;
} else {
Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
Aligned = false;
}
if (!llvm::is_contained(Regs, Reg))
Regs.push_back(Reg);
}
if (Regs.size() >= 2)
Cost += Regs.size() - 1;
else if (!Aligned)
Cost += 1;
}
return Cost;
}
}

return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
Expand Down
168 changes: 168 additions & 0 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ class VectorCombine {
bool foldShuffleOfSelects(Instruction &I);
bool foldShuffleOfCastops(Instruction &I);
bool foldShuffleOfShuffles(Instruction &I);
bool foldShufflesOfLengthChangingShuffles(Instruction &I);
bool foldShuffleOfIntrinsics(Instruction &I);
bool foldShuffleToIdentity(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
Expand Down Expand Up @@ -2877,6 +2878,171 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
return true;
}

/// Try to convert a chain of length-preserving shuffles that are fed by
/// length-changing shuffles from the same source, e.g. a chain of length 3:
///
///   "shuffle (shuffle (shuffle x, (shuffle y, undef)),
///                     (shuffle y, undef)),
///            (shuffle y, undef)"
///
/// into a single shuffle fed by a length-changing shuffle:
///
///   "shuffle x, (shuffle y, undef)"
///
/// Such chains arise e.g. from folding extract/insert sequences.
bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
  // Number of chain links successfully folded so far; the transform only
  // fires once at least two links have been combined (see the check at the
  // bottom).
  unsigned ChainLength = 0;
  // Accumulated mask of the final two-source "shuffle trunk, leaf".
  SmallVector<int> Mask;
  // Accumulated mask of the single length-changing leaf "shuffle y, undef".
  SmallVector<int> YMask;
  InstructionCost OldCost = 0;
  InstructionCost NewCost = 0;
  FixedVectorType *TrunkType = cast<FixedVectorType>(I.getType());
  Value *Trunk = &I;
  unsigned NumTrunkElts = TrunkType->getNumElements();
  FixedVectorType *YType = nullptr; // Type of the common leaf source 'y'.
  Value *Y = nullptr;               // Common source feeding every leaf.

  for (;;) {
    // Match the current trunk against (commutations of) the pattern
    // "shuffle trunk', (shuffle y, undef)"
    ArrayRef<int> OuterMask;
    Value *OuterV0, *OuterV1;
    // Intermediate links must be single-use; the chain root (I itself) may
    // have any number of uses since it is the value being replaced.
    if (ChainLength != 0 && !Trunk->hasOneUse())
      break;
    if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
                                m_Mask(OuterMask))))
      break;
    if (OuterV0->getType() != TrunkType) {
      // This shuffle is not length-preserving, so it cannot be part of the
      // chain.
      break;
    }

    ArrayRef<int> InnerMask0, InnerMask1;
    Value *A0, *A1, *B0, *B1;
    bool Match0 =
        match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
    bool Match1 =
        match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
    // A "leaf" is a length-changing shuffle: its source type differs from the
    // trunk (result) type.
    bool Match0Leaf = Match0 && A0->getType() != I.getType();
    bool Match1Leaf = Match1 && A1->getType() != I.getType();
    if (Match0Leaf == Match1Leaf) {
      // Only handle the case of exactly one leaf in each step. The "two leaves"
      // case is handled by foldShuffleOfShuffles.
      break;
    }

    // Canonicalize so that operand 1 is the leaf. Commuting the operands of
    // the outer shuffle requires swapping the two halves of its mask's index
    // space.
    SmallVector<int> CommutedOuterMask;
    if (Match0Leaf) {
      std::swap(OuterV0, OuterV1);
      std::swap(InnerMask0, InnerMask1);
      std::swap(A0, A1);
      std::swap(B0, B1);
      llvm::append_range(CommutedOuterMask, OuterMask);
      for (int &M : CommutedOuterMask) {
        if (M == PoisonMaskElem)
          continue;
        if (M < (int)NumTrunkElts)
          M += NumTrunkElts;
        else
          M -= NumTrunkElts;
      }
      OuterMask = CommutedOuterMask;
    }
    if (!OuterV1->hasOneUse())
      break;

    // Every leaf must shuffle the same source 'y'; poison operands are
    // ignored when establishing it.
    if (!isa<PoisonValue>(A1)) {
      if (!Y)
        Y = A1;
      else if (Y != A1)
        break;
    }
    if (!isa<PoisonValue>(B1)) {
      if (!Y)
        Y = B1;
      else if (Y != B1)
        break;
    }

    // Cost of the two shuffles this step would fold away.
    InstructionCost LocalOldCost =
        TTI.getInstructionCost(cast<User>(Trunk), CostKind) +
        TTI.getInstructionCost(cast<User>(OuterV1), CostKind);

    // Handle the initial (start of chain) case.
    if (!ChainLength) {
      YType = cast<FixedVectorType>(A1->getType());
      Mask.assign(OuterMask);
      YMask.assign(InnerMask1);
      OldCost = NewCost = LocalOldCost;
      Trunk = OuterV0;
      ChainLength++;
      continue;
    }

    // For the non-root case, first attempt to combine masks.
    // Leaf masks are merged lane-wise: lanes must agree wherever both define
    // an element; a lane defined by only one leaf takes that leaf's value.
    // NOTE(review): this assumes all leaf masks index the same operand slot
    // of "shuffle y, undef" — confirm leaves mixing y between operand 0 and
    // operand 1 cannot reach this point with compatible masks.
    SmallVector<int> NewYMask(YMask);
    bool Valid = true;
    for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, InnerMask1)) {
      if (LeafM == -1 || CombinedM == LeafM)
        continue;
      if (CombinedM == -1) {
        CombinedM = LeafM;
      } else {
        Valid = false;
        break;
      }
    }
    if (!Valid)
      break;

    // Compose the accumulated two-source mask with this link's outer mask:
    // lanes taken from the leaf (index >= NumTrunkElts) or poison pass
    // through unchanged; lanes taken from the trunk are redirected through
    // OuterMask.
    SmallVector<int> NewMask;
    NewMask.reserve(NumTrunkElts);
    for (int M : Mask) {
      if (M < 0 || M >= (int)NumTrunkElts)
        NewMask.push_back(M);
      else
        NewMask.push_back(OuterMask[M]);
    }

    // Break the chain if adding this new step complicates the shuffles such
    // that it would increase the new cost by more than the old cost of this
    // step.
    InstructionCost LocalNewCost = 0;
    LocalNewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                       TrunkType, YType, NewYMask, CostKind);
    LocalNewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
                                       TrunkType, TrunkType, NewMask, CostKind);

    if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
      break;

    LLVM_DEBUG({
      if (ChainLength == 1) {
        dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
               << I << '\n';
      }
      dbgs() << " next chain link: " << *Trunk << '\n'
             << " old cost: " << (OldCost + LocalOldCost)
             << " new cost: " << LocalNewCost << '\n';
    });

    // Commit this link and continue walking up the chain.
    Mask = NewMask;
    YMask = NewYMask;
    OldCost += LocalOldCost;
    NewCost = LocalNewCost;
    Trunk = OuterV0;
    ChainLength++;
  }
  // A single link already has the desired "shuffle x, (shuffle y, undef)"
  // shape; only fold chains of two or more links.
  if (ChainLength <= 1)
    return false;

  Value *Leaf = Builder.CreateShuffleVector(Y, PoisonValue::get(YType), YMask);
  Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
  replaceValue(I, *Root);
  return true;
}

/// Try to convert
/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
Expand Down Expand Up @@ -4718,6 +4884,8 @@ bool VectorCombine::run() {
return true;
if (foldShuffleOfShuffles(I))
return true;
if (foldShufflesOfLengthChangingShuffles(I))
return true;
if (foldShuffleOfIntrinsics(I))
return true;
if (foldSelectShuffle(I))
Expand Down
Loading
Loading