diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 21a42c20b04f2..385cc54f19abe 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,8 +2242,47 @@ class BoUpSLP {
   /// may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                     Align Alignment, const int64_t Diff, Value *Ptr0,
-                     Value *PtrN, StridedPtrInfo &SPtrInfo) const;
+                     Align Alignment, int64_t Diff, size_t Sz) const;
+
+  /// Return true if an array of scalar loads can be replaced with a strided
+  /// load (with constant stride).
+  ///
+  /// It is possible that the load gets "widened". Suppose that originally
+  /// each load loads `k` bytes and `PointerOps` can be arranged as follows
+  /// (`%s` is constant):
+  ///  %b + 0 * %s + 0
+  ///  %b + 0 * %s + 1
+  ///  %b + 0 * %s + 2
+  ///  ...
+  ///  %b + 0 * %s + (w - 1)
+  ///
+  ///  %b + 1 * %s + 0
+  ///  %b + 1 * %s + 1
+  ///  %b + 1 * %s + 2
+  ///  ...
+  ///  %b + 1 * %s + (w - 1)
+  ///  ...
+  ///
+  ///  %b + (n - 1) * %s + 0
+  ///  %b + (n - 1) * %s + 1
+  ///  %b + (n - 1) * %s + 2
+  ///  ...
+  ///  %b + (n - 1) * %s + (w - 1)
+  ///
+  /// In this case we will generate a strided load of type `<n x iM>`, where
+  /// `M = k * w * 8`.
+  ///
+  /// \param PointerOps list of pointer arguments of loads.
+  /// \param ElemTy original scalar type of loads.
+  /// \param Alignment alignment of the first load.
+  /// \param SortedIndices the order of `PointerOps` as returned by
+  /// `sortPtrAccesses`.
+  /// \param Diff pointer difference between the lowest and the highest
+  /// pointer in `PointerOps` as returned by `getPointersDiff`.
+  /// \param Ptr0 first pointer in `PointerOps`.
+  /// \param PtrN last pointer in `PointerOps`.
+  /// \param SPtrInfo if the function returns `true`, it also sets all the
+  /// fields of `SPtrInfo` necessary to generate the strided load later.
+  bool analyzeConstantStrideCandidate(const ArrayRef<Value *> PointerOps,
+                                      Type *ElemTy, Align Alignment,
+                                      const SmallVectorImpl<unsigned> &SortedIndices,
+                                      const int64_t Diff, Value *Ptr0, Value *PtrN,
+                                      StridedPtrInfo &SPtrInfo) const;
 
   /// Return true if an array of scalar loads can be replaced with a strided
   /// load (with run-time stride).
@@ -6844,12 +6883,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 /// current graph (for masked gathers extra extractelement instructions
 /// might be required).
 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                            Align Alignment, const int64_t Diff, Value *Ptr0,
-                            Value *PtrN, StridedPtrInfo &SPtrInfo) const {
-  const size_t Sz = PointerOps.size();
-  if (Diff % (Sz - 1) != 0)
-    return false;
-
+                            Align Alignment, int64_t Diff, size_t Sz) const {
   // Try to generate strided load node.
   auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
     return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
@@ -6870,29 +6904,110 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
       return false;
     if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
       return false;
+  }
+  return true;
+}
 
-    // Iterate through all pointers and check if all distances are
-    // unique multiple of Dist.
-    SmallSet<int64_t, 4> Dists;
-    for (Value *Ptr : PointerOps) {
-      int64_t Dist = 0;
-      if (Ptr == PtrN)
-        Dist = Diff;
-      else if (Ptr != Ptr0)
-        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
-      // If the strides are not the same or repeated, we can't
-      // vectorize.
-      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
-        break;
+bool BoUpSLP::analyzeConstantStrideCandidate(
+    const ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+  const unsigned Sz = PointerOps.size();
+  SmallVector<int64_t> SortedOffsetsFromBase(Sz);
+  // Go through `PointerOps` in sorted order and record offsets from `Ptr0`.
+  for (unsigned I : seq<unsigned>(Sz)) {
+    Value *Ptr =
+        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+    SortedOffsetsFromBase[I] =
+        *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+  }
+  assert(SortedOffsetsFromBase.size() > 1 &&
+         "Trying to generate strided load for less than 2 loads");
+  // The code below checks that `SortedOffsetsFromBase` looks as follows:
+  // ```
+  // [
+  //   (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}),  // first group
+  //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}),  // second group
+  //   ...
+  //   (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
+  //   GroupSize - 1}),                                  // last group
+  // ]
+  // ```
+  // The distances between consecutive elements within each group should all
+  // be the same `StrideWithinGroup`. The distances between the first elements
+  // of consecutive groups should all be the same `StrideBetweenGroups`.
+
+  int64_t StrideWithinGroup =
+      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+  // Determine the size of the first group. Later we will check that all
+  // other groups have the same size.
+  unsigned GroupSize = 1;
+  for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+    if (SortedOffsetsFromBase[GroupSize] -
+            SortedOffsetsFromBase[GroupSize - 1] !=
+        StrideWithinGroup)
+      break;
+  }
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  int64_t StrideIntVal = StrideWithinGroup;
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+  // Quick detour: at this point we can say what the type of the strided load
+  // would be if all the checks pass. Check if this type is legal for the
+  // target.
+  if (Sz != GroupSize) {
+    if (Sz % GroupSize != 0)
+      return false;
+    VecSz = Sz / GroupSize;
+
+    if (StrideWithinGroup != 1)
+      return false;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   GroupSize);
+    StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+    if (!TTI->isTypeLegal(StridedLoadTy) ||
+        !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+      return false;
+
+    // Continue with checking the "shape" of `SortedOffsetsFromBase`.
+    // Check that the strides between groups are all the same.
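+    // As an illustrative sketch of the widened case (hypothetical values,
+    // not taken from any test): eight i8 loads at offsets
+    // {0, 1, 100, 101, 200, 201, 300, 301} from a base `%b` form four
+    // groups of size 2 with StrideWithinGroup == 1 and
+    // StrideBetweenGroups == 100. Each group is reinterpreted as a single
+    // i16 element, so, assuming the legality checks above pass, the emitted
+    // IR would look roughly like:
+    //   %v = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(
+    //            ptr align 1 %b, i64 100, <4 x i1> splat (i1 true), i32 4)
+    //   %w = bitcast <4 x i16> %v to <8 x i8>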
+    unsigned CurrentGroupStartIdx = GroupSize;
+    int64_t StrideBetweenGroups =
+        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+    StrideIntVal = StrideBetweenGroups;
+    for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
+      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+              SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
+          StrideBetweenGroups)
+        return false;
     }
-    if (Dists.size() == Sz) {
-      Type *StrideTy = DL->getIndexType(Ptr0->getType());
-      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
-      SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
-      return true;
+
+    auto CheckGroup = [&](const unsigned StartIdx, const unsigned GroupSize0,
+                          const int64_t StrideWithinGroup) -> bool {
+      unsigned GroupEndIdx = StartIdx + 1;
+      for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+        if (SortedOffsetsFromBase[GroupEndIdx] -
+                SortedOffsetsFromBase[GroupEndIdx - 1] !=
+            StrideWithinGroup)
+          break;
+      }
+      return GroupEndIdx - StartIdx == GroupSize0;
+    };
+    for (unsigned I = 0; I < Sz; I += GroupSize) {
+      if (!CheckGroup(I, GroupSize, StrideWithinGroup))
+        return false;
     }
   }
-  return false;
+
+  if (!isStridedLoad(PointerOps, ScalarTy, CommonAlignment, Diff, VecSz))
+    return false;
+
+  Type *StrideTy = DL->getIndexType(Ptr0->getType());
+  SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+  SPtrInfo.Ty = StridedLoadTy;
+  return true;
 }
 
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
@@ -6990,8 +7105,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       Align Alignment =
           cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
               ->getAlign();
-      if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
-                        SPtrInfo))
+      if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment,
+                                         Order, *Diff, Ptr0, PtrN, SPtrInfo))
         return LoadsState::StridedVectorize;
     }
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -14902,11 +15017,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       break;
     case TreeEntry::StridedVectorize: {
+      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+      FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+      assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry.");
       Align CommonAlignment =
           computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
       VecLdCost = TTI->getStridedMemoryOpCost(
-          Instruction::Load, VecTy, LI0->getPointerOperand(),
+          Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
           /*VariableMask=*/false, CommonAlignment, CostKind);
+      if (StridedLoadTy != VecTy)
+        VecLdCost +=
+            TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+                                  getCastContextHint(*E), CostKind);
+      break;
     }
     case TreeEntry::CompressVectorize: {
@@ -19670,6 +19793,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
                    ?
NewLI : ::propagateMetadata(NewLI, E->Scalars); + if (StridedLoadTy != VecTy) + V = Builder.CreateBitOrPointerCast(V, VecTy); V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 02e05b2e4138a..12725e7d46273 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s +; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -passes=slp-vectorizer -S < %s | FileCheck %s define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) { ; CHECK-LABEL: define void @const_stride_1_no_reordering( @@ -621,22 +621,10 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) ; CHECK-LABEL: define void @constant_stride_widen_no_reordering( ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 -; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100 -; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200 -; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll index 82c940353ba5a..60d3b291b5dd4 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll @@ -5,18 +5,19 @@ define i16 @test(ptr %i) { ; CHECK-LABEL: define i16 @test( ; CHECK-SAME: ptr [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[I]], i32 0 -; 
CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[GEP_US154:%.*]] = getelementptr i8, ptr [[I]], i64 132860 ; CHECK-NEXT: [[GEP_US154_2:%.*]] = getelementptr i8, ptr [[I]], i64 142688 ; CHECK-NEXT: br label %[[FOR_COND5_US:.*]] ; CHECK: [[FOR_COND5_US]]: +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 [[GEP_US154_2]], i64 4914, <4 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP3]], i32 2, <4 x i1> splat (i1 true), <4 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP5]], <4 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP8]], i16 0) ; CHECK-NEXT: ret i16 [[TMP9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index b99a1c2d83394..dbcaafa9e5a8b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,13 +15,15 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] 
= call <88 x float> @llvm.masked.load.v88f32.p0(ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), i32 16, <88 x i1> , <88 x float> poison) +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 8 getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), i64 336, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll index 7bbc694dc5181..12cb86287ce6f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll @@ -4,12 +4,11 @@ define double @test() { ; CHECK-LABEL: define double @test() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), align 16 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 5), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 9), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> , double [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i64(ptr align 8 getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), i64 24, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP3]], <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP6]], 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index 1b11c3dcc081c..a577469a9aef0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -11,16 +11,10 @@ define void @foo() { ; SSE-LABEL: @foo( -; SSE-NEXT: [[TMP1:%.*]] = load i32, ptr @b, 
align 16 -; SSE-NEXT: store i32 [[TMP1]], ptr @a, align 16 -; SSE-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; SSE-NEXT: store <8 x i32> [[TMP3]], ptr @a, align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index fde76f8b0e8b9..ca129e7ab97f8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -153,36 +153,46 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX-NEXT: 
[[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: define void @gather_load_2( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX2-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX2-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX2-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX2-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: define void @gather_load_2( @@ -569,11 +579,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-LABEL: define void @gather_load_div( ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, 
ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -583,30 +591,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], 
float [[TMP22]], i32 7 -; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 ; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 ; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 @@ -618,11 +621,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-LABEL: define void @gather_load_div( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -632,30 +633,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x 
float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0 ; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 -; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX2-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> ; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 ; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 ; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index cf380f04a6939..f651fa53c53c4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -153,36 +153,46 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], 
align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: define void @gather_load_2( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX2-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX2-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX2-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP14:%.*]] = insertelement 
<4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX2-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: define void @gather_load_2( @@ -569,11 +579,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-LABEL: define void @gather_load_div( ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -583,30 +591,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = call <2 x float> 
@llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 -; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 ; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 ; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 @@ -618,11 +621,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-LABEL: define void @gather_load_div( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -632,30 +633,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr 
[[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0 ; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 -; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX2-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = 
shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> 
 ; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
 ; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
 ; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index f921278cdecf3..5bf3783034190 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -11,23 +11,62 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
-; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
-; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> , <15 x double> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> 
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> 
-; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> 
-; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
+; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8
+; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3
+; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8
+; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5
+; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8
+; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7
+; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8
+; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9
+; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8
+; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11
+; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8
+; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13
+; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8
+; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15
+; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = call <24 x double> @llvm.masked.load.v24f64.p0(ptr [[ARG1:%.*]], i32 8, <24 x i1> , <24 x double> poison)
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <16 x i32> 
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LD1_0]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[LD1_1]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> poison, double [[LD1_2]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[LD1_3]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP17]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x double> [[TMP16]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd fast <2 x double> [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> poison, double [[LD1_4]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP22]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = fmul fast <2 x double> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x double> [[TMP20]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> poison, double [[LD1_5]], i32 0
+; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <2 x double> [[TMP26]], [[TMP28]]
+; CHECK-NEXT: [[TMP30:%.*]] = fadd fast <2 x double> [[TMP25]], [[TMP29]]
+; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> poison, double [[LD1_6]], i32 0
+; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <2 x double> [[TMP32]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP34:%.*]] = fmul fast <2 x double> [[TMP31]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = fadd fast <2 x double> [[TMP30]], [[TMP34]]
+; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP37:%.*]] = insertelement <2 x double> poison, double [[LD1_7]], i32 0
+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP37]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <2 x double> [[TMP36]], [[TMP38]]
+; CHECK-NEXT: [[I143:%.*]] = fadd fast <2 x double> [[TMP35]], [[TMP39]]
 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> 
 ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
index 19ce11c457f63..bd24093218874 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
@@ -5,18 +5,15 @@ define void @test() {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison)
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> 
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> 
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <4 x i32> 
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> 
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP0]], <4 x i32> 
 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> 
@@ -66,18 +63,15 @@ define void @test1() {
 ; CHECK-LABEL: define void @test1(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison)
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> 
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3
 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> 
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP14]], <4 x i32> 
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> 
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP0]], <4 x i32> 
 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64>
 ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer
@@ -129,18 +123,15 @@ define void @test_div() {
 ; CHECK-LABEL: define void @test_div(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison)
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> 
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> 
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> 
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> 
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP0]], <4 x i32> 
 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], 
 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> 
@@ -190,18 +181,15 @@ define void @test_rem() {
 ; CHECK-LABEL: define void @test_rem(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison)
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> 
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> 
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> 
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> 
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP0]], <4 x i32> 
 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], 
 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll
index 5aa4dba2b8a1b..4deddc138727a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll
@@ -13,31 +13,31 @@ define i64 @Foo(ptr align 8 dereferenceable(344) %0, i64 %1) {
 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP0]], align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP7]], i32 1
 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> , i64 [[TMP1]], i32 1
 ; CHECK-NEXT: br label %[[BB16:.*]]
 ; CHECK: [[BB16]]:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[TMP25:.*]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i64> [ [[TMP13]], [[TMP2]] ], [ [[TMP29:%.*]], %[[TMP25]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[_LOOPEXIT206:.*]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i64> [ [[TMP13]], [[TMP2]] ], [ [[TMP29:%.*]], %[[_LOOPEXIT206]] ]
 ; CHECK-NEXT: switch i32 0, label %[[BB19:.*]] [
-; CHECK-NEXT: i32 0, label %[[TMP25]]
+; CHECK-NEXT: i32 0, label %[[_LOOPEXIT206]]
 ; CHECK-NEXT: ]
 ; CHECK: [[BB19]]:
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> poison, <4 x i32> 
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> poison, <4 x i32> 
 ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 0, i32 1
 ; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 0, i32 2
 ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i64> [[TMP22]], <4 x i64> poison, <4 x i32> 
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> [[TMP18]], <2 x i32> 
-; CHECK-NEXT: br label %[[TMP25]]
-; CHECK: [[TMP25]]:
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> [[TMP18]], <2 x i32> 
+; CHECK-NEXT: br label %[[_LOOPEXIT206]]
+; CHECK: [[_LOOPEXIT206]]:
 ; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i64> [ [[TMP17]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ]
 ; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i64> [ [[TMP23]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ]
 ; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i64> [ [[TMP24]], %[[BB19]] ], [ [[TMP15]], %[[BB16]] ]
-; CHECK-NEXT: [[TMP29]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> , <2 x i32> 
+; CHECK-NEXT: [[TMP29]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> , <2 x i32> 
 ; CHECK-NEXT: br i1 false, label %[[DOTLOOPEXIT206:.*]], label %[[BB16]]
-; CHECK: [[_LOOPEXIT206:.*:]]
+; CHECK: [[_LOOPEXIT207:.*:]]
 ; CHECK-NEXT: switch i32 0, label %[[BB32:.*]] [
 ; CHECK-NEXT: i32 0, [[DOTCONT174:label %.*]]
 ; CHECK-NEXT: i32 1, label %[[BB30:.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll
index 972a58cecc822..ed0782fb5b84d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll
@@ -6,31 +6,33 @@ define void @test(ptr %p) {
 ; CHECK-SAME: ptr [[P:%.*]]) {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: [[ARRAYIDX7_US_I_841:%.*]] = getelementptr i8, ptr [[P]], i64 36
+; CHECK-NEXT: [[ARRAYIDX7_US_I_1051:%.*]] = getelementptr i8, ptr [[P]], i64 44
 ; CHECK-NEXT: [[ARRAYIDX7_US_I_1261:%.*]] = getelementptr i8, ptr [[P]], i64 52
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_1261]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> 
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX7_US_I_1051]], i64 -44, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> 
 ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> 
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP19]], <16 x i32> 
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> 
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <12 x i32> 
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <12 x i32> 
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> [[TMP20]], <12 x i32> 
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> 
 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> 
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP9]], <16 x i32> 
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 6
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> 
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP9]], <16 x i32> 
 ; CHECK-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP3]], [[TMP12]]
 ; CHECK-NEXT: [[TMP14:%.*]] = srem <16 x i32> [[TMP13]], 
-; CHECK-NEXT: [[TMP15:%.*]] = or <12 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = srem <12 x i32> [[TMP15]], 
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> 
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = srem <4 x i32> [[TMP16]], splat (i32 1)
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = srem <8 x i32> [[TMP21]], 
 ; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_I:.*]]
 ; CHECK: [[FOR_COND1_PREHEADER_US_I]]:
 ; CHECK-NEXT: [[A_PROMOTED253537_US_I:%.*]] = phi i32 [ [[OP_RDX8:%.*]], %[[FOR_COND1_PREHEADER_US_I]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]])
-; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> [[TMP16]])
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <4 x i32> 
+; CHECK-NEXT: [[RDX_OP:%.*]] = add <4 x i32> [[TMP22]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[RDX_OP]], <4 x i32> poison, <8 x i32> 
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> [[TMP23]], <8 x i32> 
+; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP24]])
 ; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP18]], [[TMP17]]
 ; CHECK-NEXT: [[OP_RDX8]] = add i32 [[OP_RDX]], 0
 ; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_I]]
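
For readers unfamiliar with the intrinsic the updated checks rely on: `llvm.experimental.vp.strided.load` takes a base pointer, a byte stride (which may be negative), a mask, and an explicit vector length, and loads lane `i` from `base + i * stride`. The sketch below is illustrative IR, not part of the patch; the function names are hypothetical, and the offsets are taken from reorder-possible-strided-node.ll above (i32 elements at indices 33 and 1, i.e. byte offsets 132 and 4, hence a -128 byte stride). It shows the scalar gather the tests previously checked for and the strided form that now replaces it.

declare <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr, i64, <2 x i1>, i32)

define <2 x i32> @scalar_form(ptr %base) {
  ; Two scalar loads 128 bytes apart, assembled into a vector by hand.
  %gep33 = getelementptr i32, ptr %base, i64 33  ; byte offset 132
  %gep1 = getelementptr i32, ptr %base, i64 1    ; byte offset 4
  %ld33 = load i32, ptr %gep33, align 4
  %ld1 = load i32, ptr %gep1, align 4
  %v0 = insertelement <2 x i32> poison, i32 %ld33, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %ld1, i32 1
  ret <2 x i32> %v1
}

define <2 x i32> @strided_form(ptr %base) {
  ; Lane 0 loads from %gep33; lane 1 from %gep33 - 128 bytes (= %base + 4).
  ; All-true mask and an explicit vector length of 2 make this equivalent
  ; to the scalar form above.
  %gep33 = getelementptr i32, ptr %base, i64 33
  %v = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 %gep33, i64 -128, <2 x i1> splat (i1 true), i32 2)
  ret <2 x i32> %v
}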