[SLP]Attempt to vectorize long stores, if short one failed.
We can try to vectorize long store sequences when the short ones fail to
vectorize because they are not profitable. This should not increase
compile time significantly (the stores are already sorted, so the
complexity stays n x log n), but it lets extra code get vectorized.
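
As a rough standalone sketch of the strategy (toy cost model and made-up names, not the SLPVectorizer code itself): try the usual candidate widths first, track vectorized slices in a bit mask, and if unprofitable slices remain, make one final pass with longer widths over the stores that are still scalar.

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy cost model: pretend only width-8 groups starting at a multiple of 8
// are profitable, so every narrow attempt fails first.
static bool tryVectorize(unsigned Begin, unsigned Width) {
  return Width == 8 && Begin % 8 == 0;
}

// Minimal model of the two-round strategy: one round with the normal
// candidate widths, then one last round with longer widths over whatever
// is still scalar.
static void vectorizeStores(unsigned NumStores, unsigned MaxVF) {
  std::vector<bool> Done(NumStores, false);
  std::vector<unsigned> Widths;
  for (unsigned W = MaxVF; W >= 2; W /= 2)
    Widths.push_back(W);

  constexpr unsigned MaxAttempts = 2;
  for (unsigned Attempt = 0; Attempt < MaxAttempts; ++Attempt) {
    for (unsigned W : Widths)
      for (unsigned I = 0; I + W <= NumStores;) {
        // Only consider a slice if none of its stores were vectorized yet.
        bool Free = std::none_of(Done.begin() + I, Done.begin() + I + W,
                                 [](bool B) { return B; });
        if (Free && tryVectorize(I, W)) {
          std::fill(Done.begin() + I, Done.begin() + I + W, true);
          std::printf("vectorized [%u, %u) at width %u\n", I, I + W, W);
          I += W;
        } else {
          ++I;
        }
      }
    if (std::all_of(Done.begin(), Done.end(), [](bool B) { return B; }))
      break;
    // Retry with wider groups (up to 4 x MaxVF, mirroring MaxVFScale in
    // the patch) in case only a long chain is profitable.
    Widths.clear();
    for (unsigned W = 4 * MaxVF; W > MaxVF; W /= 2)
      Widths.push_back(W);
  }
}

int main() { vectorizeStores(16, /*MaxVF=*/4); }

With this toy model, the first round (widths 4 and 2) vectorizes nothing, and the retry round finds both width-8 chains, which is exactly the extra vectorization this patch enables.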

Metric: size..text

Program                                                                         size..text
                                                                                results     results0    diff
         test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test  1088012.00  1088236.00  0.0%
                  test-suite :: SingleSource/UnitTests/matrix-types-spec.test   480396.00   480476.00  0.0%
          test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   664613.00   664661.00  0.0%
         test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   664613.00   664661.00  0.0%
        test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test  2041105.00  2040961.00 -0.0%
                 test-suite :: MultiSource/Applications/JM/lencod/lencod.test   836563.00   836387.00 -0.0%
                 test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  1035100.00  1032140.00 -0.3%

In all benchmarks, extra code gets vectorized.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #88563
alexey-bataev committed Apr 17, 2024
1 parent 693a458 commit 6f7160e
Showing 2 changed files with 80 additions and 72 deletions.
106 changes: 69 additions & 37 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15164,10 +15164,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
 
-  // Stores the pair of stores (first_store, last_store) in a range, that were
-  // already tried to be vectorized. Allows to skip the store ranges that were
-  // already tried to be vectorized but the attempts were unsuccessful.
-  DenseSet<std::pair<Value *, Value *>> TriedSequences;
   struct StoreDistCompare {
     bool operator()(const std::pair<unsigned, int> &Op1,
                     const std::pair<unsigned, int> &Op2) const {
@@ -15209,8 +15205,10 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
     Type *ValueTy = StoreTy;
     if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
       ValueTy = Trunc->getSrcTy();
-    unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
-        R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
+    unsigned MinVF = std::max<unsigned>(
+        2, PowerOf2Ceil(TTI->getStoreMinimumVF(
+               R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
+               ValueTy)));
 
     if (MaxVF < MinVF) {
       LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15236,40 +15234,74 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         VF = Size > MaxVF ? NonPowerOf2VF : Size;
         Size *= 2;
       });
-      unsigned StartIdx = 0;
-      for (unsigned Size : CandidateVFs) {
-        for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-          ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
-          assert(
-              all_of(
-                  Slice,
-                  [&](Value *V) {
-                    return cast<StoreInst>(V)->getValueOperand()->getType() ==
-                           cast<StoreInst>(Slice.front())
-                               ->getValueOperand()
-                               ->getType();
-                  }) &&
-              "Expected all operands of same type.");
-          if (!VectorizedStores.count(Slice.front()) &&
-              !VectorizedStores.count(Slice.back()) &&
-              TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
-                  .second &&
-              vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
-            // Mark the vectorized stores so that we don't vectorize them again.
-            VectorizedStores.insert(Slice.begin(), Slice.end());
-            Changed = true;
-            // If we vectorized initial block, no need to try to vectorize it
-            // again.
-            if (Cnt == StartIdx)
-              StartIdx += Size;
-            Cnt += Size;
-            continue;
+      unsigned End = Operands.size();
+      unsigned Repeat = 0;
+      constexpr unsigned MaxAttempts = 2;
+      SmallBitVector Range(Operands.size());
+      while (true) {
+        ++Repeat;
+        for (unsigned Size : CandidateVFs) {
+          int StartIdx = Range.find_first_unset();
+          while (StartIdx != -1) {
+            int EndIdx = Range.find_next(StartIdx);
+            unsigned Sz = EndIdx == -1 ? End : EndIdx;
+            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
+              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+              assert(all_of(Slice,
+                            [&](Value *V) {
+                              return cast<StoreInst>(V)
+                                         ->getValueOperand()
+                                         ->getType() ==
+                                     cast<StoreInst>(Slice.front())
+                                         ->getValueOperand()
+                                         ->getType();
+                            }) &&
+                     "Expected all operands of same type.");
+              if (vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                VectorizedStores.insert(Slice.begin(), Slice.end());
+                Changed = true;
+                Range.set(Cnt, Cnt + Size);
+                if (Cnt < StartIdx + MinVF)
+                  Range.set(StartIdx, Cnt);
+                if (Cnt > EndIdx - Size - MinVF) {
+                  Range.set(Cnt + Size, EndIdx);
+                  End = Cnt;
+                }
+                Cnt += Size;
+                continue;
+              }
+              ++Cnt;
+            }
+            if (Sz >= End)
+              break;
+            StartIdx = Range.find_next_unset(EndIdx);
           }
-          ++Cnt;
         }
-        // Check if the whole array was vectorized already - exit.
-        if (StartIdx >= Operands.size())
+        // All values vectorized - exit.
+        if (Range.all())
           break;
+        // Check if tried all attempts or no need for the last attempt at all.
+        if (Repeat >= MaxAttempts)
+          break;
+        constexpr unsigned MaxVFScale = 4;
+        constexpr unsigned StoresLimit = 16;
+        const unsigned MaxTotalNum = std::min(
+            std::max<unsigned>(StoresLimit, MaxVFScale * MaxVF),
+            bit_floor(static_cast<unsigned>(Range.find_last_unset() -
+                                            Range.find_first_unset() + 1)));
+        if (MaxVF >= MaxTotalNum)
+          break;
+        // Last attempt to vectorize max number of elements, if all previous
+        // attempts were unsuccessful because of the cost issues.
+        CandidateVFs.clear();
+        for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
+          CandidateVFs.push_back(Size);
       }
     }
   };
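
The new control flow relies on SmallBitVector's gap-scanning calls. The following standalone sketch (assuming LLVM's ADT headers are available; the Range contents are invented) shows how find_first_unset/find_next/find_next_unset enumerate the still-unvectorized regions the same way the loop above does:

#include "llvm/ADT/SmallBitVector.h"
#include <cstdio>

using namespace llvm;

// Walk the unset regions of Range: find_first_unset() yields the start of
// a scalar region, find_next() the first vectorized element after it, and
// find_next_unset() the start of the next scalar region.
static void dumpScalarRegions(const SmallBitVector &Range, unsigned End) {
  int StartIdx = Range.find_first_unset();
  while (StartIdx != -1) {
    int EndIdx = Range.find_next(StartIdx);
    unsigned Sz = EndIdx == -1 ? End : EndIdx;
    std::printf("still-scalar stores in [%d, %u)\n", StartIdx, Sz);
    if (Sz >= End)
      break;
    StartIdx = Range.find_next_unset(EndIdx);
  }
}

int main() {
  // 16 stores; pretend slices [4, 8) and [12, 16) were vectorized.
  SmallBitVector Range(16);
  Range.set(4, 8);
  Range.set(12, 16);
  dumpScalarRegions(Range, 16); // prints [0, 4) and [8, 12)
}
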
46 changes: 11 additions & 35 deletions llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; SSE-LABEL: @store_i64(
 ; SSE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT:    store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT:    store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT:    store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT:    store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; SSE-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT:    [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT:    [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
+; SSE-NEXT:    store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @store_i64(
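
In scalar terms, each lane of the rewritten store_i64 body multiplies by the widened scale, shifts right by 15, and clamps at 255; with this patch the SSE run covers all four lanes with one <4 x i64> load/store pair instead of four scalar chains. A rough C++ paraphrase of the per-lane computation (my reading of the IR above, not code from the test):

#include <cstdint>
#include <cstdio>

// One lane of the kernel in pr46983.ll: mul by the zext'ed scale,
// lshr by 15, then select between the masked value and 255 based on
// an unsigned compare of the truncated result.
static void storeI64(uint64_t *P, uint32_t Scale) {
  uint64_t S = Scale; // zext i32 -> i64
  for (int I = 0; I < 4; ++I) {
    uint64_t V = (P[I] * S) >> 15;              // mul + lshr 15
    uint32_t T = static_cast<uint32_t>(V);      // trunc i64 -> i32
    P[I] = T < 255 ? (V & 0xffffffffULL) : 255; // icmp ult + and + select
  }
}

int main() {
  uint64_t Data[4] = {1 << 15, 2 << 15, 3 << 15, 1 << 20};
  storeI64(Data, 100);
  for (uint64_t V : Data)
    std::printf("%llu\n", static_cast<unsigned long long>(V));
}
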
