diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8ae38550d3095..7694627c3b043 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15237,60 +15237,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, Size *= 2; }); unsigned StartIdx = 0; - unsigned Repeat = 0; - constexpr unsigned MaxAttempts = 2; - while (true) { - ++Repeat; - for (unsigned Size : CandidateVFs) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); - assert( - all_of( - Slice, - [&](Value *V) { - return cast(V)->getValueOperand()->getType() == - cast(Slice.front()) - ->getValueOperand() - ->getType(); - }) && - "Expected all operands of same type."); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - TriedSequences - .insert(std::make_pair(Slice.front(), Slice.back())) - .second && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them - // again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize - // it again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; - } - ++Cnt; - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Operands.size()) { - Repeat = MaxAttempts; - break; + for (unsigned Size : CandidateVFs) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); + assert( + all_of( + Slice, + [&](Value *V) { + return cast(V)->getValueOperand()->getType() == + cast(Slice.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) + .second && + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; } + ++Cnt; } - // Check if tried all attempts or no need for the last attempts at all. - if (Repeat >= MaxAttempts) - break; - const unsigned MaxTotalNum = bit_floor(Operands.size() - StartIdx); - if (MaxVF >= MaxTotalNum) + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) break; - // Last attempt to vectorize max number of elements, if all previous - // attempts were unsuccessful because of the cost issues. - CandidateVFs.clear(); - for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2) { - CandidateVFs.push_back(Size); - } } } }; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll index 3deab0975ce76..75505f632a43f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -100,17 +100,41 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { ; SSE-LABEL: @store_i64( ; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer -; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], -; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], -; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> -; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64> -; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 +; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 +; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 +; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 +; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] +; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 +; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 +; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 +; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 +; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 +; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] +; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 +; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 +; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 +; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 +; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24 +; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] +; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 +; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 +; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 +; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 +; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @store_i64(