Skip to content

Commit

Permalink
Revert "[SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions."
Browse files Browse the repository at this point in the history

This reverts commit cd341f3 to fix
a crash revealed by buildbot https://lab.llvm.org/buildbot#builders/124/builds/7108.
  • Loading branch information
alexey-bataev committed Apr 18, 2023
1 parent c37c9f2 commit d7a40a4
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 107 deletions.
119 changes: 42 additions & 77 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -946,24 +946,18 @@ static bool isSimple(Instruction *I) {
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
bool ExtendingManyInputs = false) {
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
if (SubMask.empty())
return;
assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) &&
"SubMask with many inputs support must be larger than the mask.");
if (Mask.empty()) {
Mask.append(SubMask.begin(), SubMask.end());
return;
}
SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
int TermValue = std::min(Mask.size(), SubMask.size());
for (int I = 0, E = SubMask.size(); I < E; ++I) {
if ((!ExtendingManyInputs &&
(SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)) ||
SubMask[I] == UndefMaskElem)
if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
Mask[SubMask[I]] >= TermValue)
continue;
NewMask[I] = Mask[SubMask[I]];
}
Expand Down Expand Up @@ -6794,8 +6788,6 @@ class BaseShuffleAnalysis {
/// analysis/transformations.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
bool IsFinalized = false;
SmallVector<int> CommonMask;
SmallVector<Value *, 2> InVectors;
const TargetTransformInfo &TTI;
InstructionCost Cost = 0;
ArrayRef<Value *> VectorizedVals;
Expand Down Expand Up @@ -7017,53 +7009,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VecTy, std::nullopt, CostKind, 0, EEVTy);
}
}
InVectors.assign(1, VecBase);
return VecBase;
}
void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
CommonMask.assign(Mask.begin(), Mask.end());
InVectors.assign(
2, Constant::getNullValue(FixedVectorType::get(
E1->Scalars.front()->getType(),
std::max(E1->getVectorFactor(), E2->getVectorFactor()))));
}
void add(const TreeEntry *E1, ArrayRef<int> Mask) {
CommonMask.assign(Mask.begin(), Mask.end());
InVectors.assign(
1, Constant::getNullValue(FixedVectorType::get(
E1->Scalars.front()->getType(), E1->getVectorFactor())));
}
void gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
Cost += getBuildVectorCost(VL, Root);
if (!Root) {
assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
InVectors.assign(1, Constant::getNullValue(FixedVectorType::get(
VL.front()->getType(), VL.size())));
}
}
/// Finalize emission of the shuffles.
InstructionCost finalize(ArrayRef<int> ExtMask) {
InstructionCost finalize() {
IsFinalized = true;
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
if (CommonMask.empty())
return Cost;
int Limit = CommonMask.size() * 2;
if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
ShuffleVectorInst::isIdentityMask(CommonMask))
return Cost;
return Cost +
TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc
: TTI::SK_PermuteSingleSrc,
FixedVectorType::get(
cast<VectorType>(InVectors.front()->getType())
->getElementType(),
CommonMask.size()),
CommonMask);
return Cost;
}

~ShuffleCostEstimator() {
assert((IsFinalized || CommonMask.empty()) &&
"Shuffle construction must be finalized.");
assert(IsFinalized && "Shuffle construction must be finalized.");
}
};

Expand Down Expand Up @@ -7151,30 +7109,35 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (Mask[I] != UndefMaskElem)
GatheredScalars[I] = PoisonValue::get(ScalarTy);
}
LLVM_DEBUG(
int Limit = Mask.size() * 2;
if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
ShuffleVectorInst::isIdentityMask(Mask)) {
// Perfect match in the graph, will reuse the previously
// vectorized node. Cost is 0.
dbgs() << "SLP: perfect diamond match for gather bundle "
"that starts with "
<< *VL.front() << ".\n";
} else {
dbgs() << "SLP: shuffled " << Entries.size()
<< " entries for bundle that starts with " << *VL.front()
<< ".\n";
});
if (Entries.size() == 1)
Estimator.add(Entries.front(), Mask);
else
Estimator.add(Entries.front(), Entries.back(), Mask);
InstructionCost GatherCost = 0;
int Limit = Mask.size() * 2;
if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
ShuffleVectorInst::isIdentityMask(Mask)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
dbgs()
<< "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n");
if (NeedToShuffleReuses)
GatherCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
FinalVecTy, E->ReuseShuffleIndices);
} else {
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
<< " entries for bundle that starts with "
<< *VL.front() << ".\n");
// Detected that instead of gather we can emit a shuffle of single/two
// previously vectorized nodes. Add the cost of the permutation rather
// than gather.
::addMask(Mask, E->ReuseShuffleIndices);
GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
}
Estimator.gather(
GatheredScalars,
Constant::getNullValue(FixedVectorType::get(
GatheredScalars.front()->getType(), GatheredScalars.size())));
return Estimator.finalize(E->ReuseShuffleIndices);
return GatherCost + Estimator.finalize();
}
if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
// Check that gather of extractelements can be represented as just a
Expand All @@ -7184,15 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// single input vector or of 2 input vectors.
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
return Cost + Estimator.finalize(E->ReuseShuffleIndices);
}
Estimator.gather(
GatheredScalars,
(ExtractShuffle || GatherShuffle)
? Constant::getNullValue(FixedVectorType::get(
GatheredScalars.front()->getType(), GatheredScalars.size()))
: nullptr);
return Estimator.finalize(E->ReuseShuffleIndices);
if (NeedToShuffleReuses)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
FinalVecTy, E->ReuseShuffleIndices);
return Cost + Estimator.finalize();
}
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
ReuseShuffleCost = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
Estimator.gather(GatheredScalars);
return ReuseShuffleCost + Estimator.finalize();
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
Expand Down
44 changes: 14 additions & 30 deletions llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
Expand Up @@ -93,36 +93,20 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
}

define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
; SSE-LABEL: @logical_and_icmp_diff_preds(
; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
; SSE-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]]
; SSE-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0
; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false
; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false
; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
; SSE-NEXT: ret i1 [[S3]]
;
; AVX-LABEL: @logical_and_icmp_diff_preds(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
; AVX-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
; AVX-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
; AVX-NEXT: ret i1 [[S3]]
; CHECK-LABEL: @logical_and_icmp_diff_preds(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
; CHECK-NEXT: ret i1 [[S3]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
Expand Down

0 comments on commit d7a40a4

Please sign in to comment.