Skip to content

Commit

Permalink
[COST] Improve cost model for shuffles in SLP.
Browse files Browse the repository at this point in the history
Introduced masks in the places where they were not previously passed and
improved the target-dependent cost models to avoid returning incorrect
cost results once the masks are added.

Differential Revision: https://reviews.llvm.org/D100486
  • Loading branch information
alexey-bataev committed Apr 27, 2022
1 parent afd6390 commit 29a470e
Show file tree
Hide file tree
Showing 8 changed files with 213 additions and 151 deletions.
2 changes: 1 addition & 1 deletion llvm/include/llvm/Analysis/VectorUtils.h
Expand Up @@ -413,7 +413,7 @@ bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
void processShuffleMasks(
ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
function_ref<void(ArrayRef<int>, unsigned)> SingleInputAction,
function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);

/// Compute a map of integer instructions to their minimum legal type
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Analysis/VectorUtils.cpp
Expand Up @@ -499,7 +499,7 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
void llvm::processShuffleMasks(
ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
function_ref<void(ArrayRef<int>, unsigned)> SingleInputAction,
function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction) {
SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
// Try to perform better estimation of the permutation.
Expand Down Expand Up @@ -543,7 +543,7 @@ void llvm::processShuffleMasks(
auto *It =
find_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
unsigned SrcReg = std::distance(Dest.begin(), It);
SingleInputAction(*It, SrcReg);
SingleInputAction(*It, SrcReg, I);
break;
}
default: {
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
Expand Up @@ -2523,7 +2523,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
/*NumOfUsedRegs=*/1,
[&Output, &DAG = DAG, NewVT]() { Output = DAG.getUNDEF(NewVT); },
[&Output, &DAG = DAG, NewVT, &DL, &Inputs,
&BuildVector](ArrayRef<int> Mask, unsigned Idx) {
&BuildVector](ArrayRef<int> Mask, unsigned Idx, unsigned /*Unused*/) {
if (Inputs[Idx]->getOpcode() == ISD::BUILD_VECTOR)
Output = BuildVector(Inputs[Idx], Inputs[Idx], Mask);
else
Expand Down
51 changes: 51 additions & 0 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Expand Up @@ -1224,6 +1224,57 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
LegalVT.getVectorNumElements());

if (!Mask.empty() && NumOfDests.isValid()) {
// Try to perform better estimation of the permutation.
// 1. Split the source/destination vectors into real registers.
// 2. Do the mask analysis to identify which real registers are
// permuted. If more than one source register is used to build a
// destination register, the cost for that destination register
// is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
// source register is used, build the mask and calculate the cost as the
// cost of PermuteSingleSrc.
// Also, for a single-register permute we try to identify whether the
// destination register is just a copy of the source register or a
// copy of the previous destination register (the cost is
// TTI::TCC_Basic). If the source register is just reused, the cost of
// this operation is 0.
unsigned E = *NumOfDests.getValue();
unsigned PrevSrcReg = 0;
ArrayRef<int> PrevRegMask;
InstructionCost Cost = 0;
processShuffleMasks(
Mask, NumOfSrcs, E, E, []() {},
[this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
&Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
// Check if the previous register can be just copied to the next
// one.
if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
PrevRegMask != RegMask)
Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
RegMask, 0, nullptr);
else
// Just a copy of previous destination register.
Cost += TTI::TCC_Basic;
return;
}
if (SrcReg != DestReg &&
any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
// Just a copy of the source register.
Cost += TTI::TCC_Basic;
}
PrevSrcReg = SrcReg;
PrevRegMask = RegMask;
},
[this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
unsigned /*Unused*/,
unsigned /*Unused*/) {
Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
0, nullptr);
});
return Cost;
}

InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
None, 0, nullptr);
Expand Down
35 changes: 23 additions & 12 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -5059,6 +5059,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a shuffle
// to extract the values into a vector register.
SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
for (auto *V : VL) {
++Idx;

Expand All @@ -5068,6 +5069,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,

// Reached the start of a new vector register.
if (Idx % EltsPerVector == 0) {
RegMask.assign(EltsPerVector, UndefMaskElem);
AllConsecutive = true;
continue;
}
Expand All @@ -5079,6 +5081,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
CurrentIdx % EltsPerVector == Idx % EltsPerVector;
RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
}

if (AllConsecutive)
Expand All @@ -5093,7 +5096,7 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
// cost to extract a vector with EltsPerVector elements.
Cost += TTI.getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc,
FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
}
return Cost;
}
Expand Down Expand Up @@ -5880,16 +5883,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI::CastContextHint::None, CostKind);
}

SmallVector<int> Mask;
buildShuffleEntryMask(
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
[E](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
},
Mask);
CommonCost =
TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
if (E->ReuseShuffleIndices.empty()) {
CommonCost =
TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
} else {
SmallVector<int> Mask;
buildShuffleEntryMask(
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
[E](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
return I->getOpcode() == E->getAltOpcode();
},
Mask);
CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
FinalVecTy, Mask);
}
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
return CommonCost + VecCost - ScalarCost;
}
Expand Down Expand Up @@ -6278,7 +6286,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
Cost += SpillCost + ExtractCost;
if (FirstUsers.size() == 1) {
int Limit = ShuffleMask.front().size() * 2;
if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) &&
if (!all_of(ShuffleMask.front(),
[Limit](int Idx) { return Idx < Limit; }) ||
!ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
InstructionCost C = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc,
Expand Down Expand Up @@ -6327,6 +6336,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
<< "SLP: Current total cost = " << Cost << "\n");
Cost -= InsertCost;
for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
if (ShuffleMask[I].empty())
continue;
// Other elements - permutation of 2 vectors (the initial one and the
// next Ith incoming vector).
unsigned VF = ShuffleMask[I].size();
Expand Down

0 comments on commit 29a470e

Please sign in to comment.