-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[SLPVectorizer] Widen constant strided loads. #162324
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Given a set of pointers, check if they can be rearranged as follows (%s is a constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2 ... %b + 0 * %s + w %b + 1 * %s + 0 %b + 1 * %s + 1 %b + 1 * %s + 2 ... %b + 1 * %s + w ... If the pointers can be rearanged in the above pattern, it means that the memory can be accessed with a strided loads of width `w` and stride `%s`.
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v Author: Mikhail Gudim (mgudim) ChangesPatch is 85.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162324.diff 12 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 91c3d4270b241..722de48e81e8a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,8 +2242,29 @@ class BoUpSLP {
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const;
+ Align Alignment, int64_t Diff, size_t Sz) const;
+ /// Given a set of pointers, check if they can be rearranged as follows (%s is
+ /// a constant):
+ /// %b + 0 * %s + 0
+ /// %b + 0 * %s + 1
+ /// %b + 0 * %s + 2
+ /// ...
+ /// %b + 0 * %s + w
+ ///
+ /// %b + 1 * %s + 0
+ /// %b + 1 * %s + 1
+ /// %b + 1 * %s + 2
+ /// ...
+ /// %b + 1 * %s + w
+ /// ...
+ ///
+ /// If the pointers can be rearanged in the above pattern, it means that the
+ /// memory can be accessed with a strided loads of width `w` and stride `%s`.
+ bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
+ Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ int64_t Diff, Value *Ptr0, Value *PtrN,
+ StridedPtrInfo &SPtrInfo) const;
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
@@ -6824,12 +6845,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
/// current graph (for masked gathers extra extractelement instructions
/// might be required).
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const {
- const size_t Sz = PointerOps.size();
- if (Diff % (Sz - 1) != 0)
- return false;
-
+ Align Alignment, int64_t Diff, size_t Sz) const {
// Try to generate strided load node.
auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
@@ -6850,29 +6866,97 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
return false;
if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
return false;
+ }
+ return true;
+}
- // Iterate through all pointers and check if all distances are
- // unique multiple of Dist.
- SmallSet<int64_t, 4> Dists;
- for (Value *Ptr : PointerOps) {
- int64_t Dist = 0;
- if (Ptr == PtrN)
- Dist = Diff;
- else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
- // If the strides are not the same or repeated, we can't
- // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+bool BoUpSLP::analyzeConstantStrideCandidate(
+ ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices, int64_t Diff, Value *Ptr0,
+ Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+ const unsigned Sz = PointerOps.size();
+ SmallVector<int64_t> SortedOffsetsFromBase;
+ SortedOffsetsFromBase.resize(Sz);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ Value *Ptr =
+ SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+ SortedOffsetsFromBase[I] =
+ *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+ }
+ assert(SortedOffsetsFromBase.size() > 1 &&
+ "Trying to generate strided load for less than 2 loads");
+ //
+ // Find where the first group ends.
+ int64_t StrideWithinGroup =
+ SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+ unsigned GroupSize = 1;
+ for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+ if (SortedOffsetsFromBase[GroupSize] -
+ SortedOffsetsFromBase[GroupSize - 1] !=
+ StrideWithinGroup)
+ break;
+ }
+ unsigned VecSz = Sz;
+ Type *ScalarTy = ElemTy;
+ int64_t StrideIntVal = StrideWithinGroup;
+ FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+ if (Sz != GroupSize) {
+ if (Sz % GroupSize != 0)
+ return false;
+ VecSz = Sz / GroupSize;
+
+ if (StrideWithinGroup != 1)
+ return false;
+ unsigned VecSz = Sz / GroupSize;
+ ScalarTy = Type::getIntNTy(SE->getContext(),
+ DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+ GroupSize);
+ StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+ if (!TTI->isTypeLegal(StridedLoadTy) ||
+ !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+ return false;
+
+ unsigned PrevGroupStartIdx = 0;
+ unsigned CurrentGroupStartIdx = GroupSize;
+ int64_t StrideBetweenGroups =
+ SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+ StrideIntVal = StrideBetweenGroups;
+ while (CurrentGroupStartIdx != Sz) {
+ if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+ SortedOffsetsFromBase[PrevGroupStartIdx] !=
+ StrideBetweenGroups)
break;
+ PrevGroupStartIdx = CurrentGroupStartIdx;
+ CurrentGroupStartIdx += GroupSize;
}
- if (Dists.size() == Sz) {
- Type *StrideTy = DL->getIndexType(Ptr0->getType());
- SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
- SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
- return true;
+ if (CurrentGroupStartIdx != Sz)
+ return false;
+
+ auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+ int64_t StrideWithinGroup) -> bool {
+ unsigned GroupEndIdx = StartIdx + 1;
+ for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+ if (SortedOffsetsFromBase[GroupEndIdx] -
+ SortedOffsetsFromBase[GroupEndIdx - 1] !=
+ StrideWithinGroup)
+ break;
+ }
+ return GroupEndIdx - StartIdx == GroupSize0;
+ };
+ for (unsigned I = 0; I < Sz; I += GroupSize) {
+ if (!CheckGroup(I, GroupSize, StrideWithinGroup))
+ return false;
}
}
- return false;
+
+ if (!isStridedLoad(PointerOps, ScalarTy, CommonAlignment, Diff, VecSz))
+ return false;
+
+ Type *StrideTy = DL->getIndexType(Ptr0->getType());
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+ SPtrInfo.Ty = StridedLoadTy;
+ return true;
}
BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
@@ -6958,8 +7042,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
Align Alignment =
cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
->getAlign();
- if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
- SPtrInfo))
+ if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
+ *Diff, Ptr0, PtrN, SPtrInfo))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -14867,11 +14951,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
break;
case TreeEntry::StridedVectorize: {
+ const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+ FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+ assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getStridedMemoryOpCost(
- Instruction::Load, VecTy, LI0->getPointerOperand(),
+ Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind);
+ if (StridedLoadTy != VecTy)
+ VecLdCost +=
+ TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+ getCastContextHint(*E), CostKind);
+
break;
}
case TreeEntry::CompressVectorize: {
@@ -19635,6 +19727,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
? NewLI
: ::propagateMetadata(NewLI, E->Scalars);
+ if (StridedLoadTy != VecTy)
+ V = Builder.CreateBitOrPointerCast(V, VecTy);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 02e05b2e4138a..12725e7d46273 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -passes=slp-vectorizer -S < %s | FileCheck %s
define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) {
; CHECK-LABEL: define void @const_stride_1_no_reordering(
@@ -621,22 +621,10 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
; CHECK-LABEL: define void @constant_stride_widen_no_reordering(
; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
-; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100
-; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200
-; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300
; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[GEP_S0]], align 1
; CHECK-NEXT: ret void
;
%gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
index 82c940353ba5a..60d3b291b5dd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
@@ -5,18 +5,19 @@ define i16 @test(ptr %i) {
; CHECK-LABEL: define i16 @test(
; CHECK-SAME: ptr [[I:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 132860, i64 137774>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[GEP_US154:%.*]] = getelementptr i8, ptr [[I]], i64 132860
; CHECK-NEXT: [[GEP_US154_2:%.*]] = getelementptr i8, ptr [[I]], i64 142688
; CHECK-NEXT: br label %[[FOR_COND5_US:.*]]
; CHECK: [[FOR_COND5_US]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2)
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 [[GEP_US154_2]], i64 4914, <4 x i1> splat (i1 true), i32 4)
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP3]], i32 2, <4 x i1> splat (i1 true), <4 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2)
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP5]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP8]], i16 0)
; CHECK-NEXT: ret i16 [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
index b99a1c2d83394..dbcaafa9e5a8b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
@@ -15,13 +15,15 @@ define void @test() {
; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]]
; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
-; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> <i32 poison, i32 0, i32 20, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> <i32 1, i32 1, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 19, i32 19, i32 19, i32 19, i32 18>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18>
+; CHECK-NEXT: [[TMP7:%.*]] = call <88 x float> @llvm.masked.load.v88f32.p0(ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), i32 16, <88 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <88 x float> poison)
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 87>
+; CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 8 getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), i64 336, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <4 x i32> <i32 poison, i32 87, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHEC...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Mikhail Gudim (mgudim) ChangesPatch is 85.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162324.diff 12 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 91c3d4270b241..722de48e81e8a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,8 +2242,29 @@ class BoUpSLP {
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const;
+ Align Alignment, int64_t Diff, size_t Sz) const;
+ /// Given a set of pointers, check if they can be rearranged as follows (%s is
+ /// a constant):
+ /// %b + 0 * %s + 0
+ /// %b + 0 * %s + 1
+ /// %b + 0 * %s + 2
+ /// ...
+ /// %b + 0 * %s + w
+ ///
+ /// %b + 1 * %s + 0
+ /// %b + 1 * %s + 1
+ /// %b + 1 * %s + 2
+ /// ...
+ /// %b + 1 * %s + w
+ /// ...
+ ///
+ /// If the pointers can be rearanged in the above pattern, it means that the
+ /// memory can be accessed with a strided loads of width `w` and stride `%s`.
+ bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
+ Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ int64_t Diff, Value *Ptr0, Value *PtrN,
+ StridedPtrInfo &SPtrInfo) const;
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
@@ -6824,12 +6845,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
/// current graph (for masked gathers extra extractelement instructions
/// might be required).
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const {
- const size_t Sz = PointerOps.size();
- if (Diff % (Sz - 1) != 0)
- return false;
-
+ Align Alignment, int64_t Diff, size_t Sz) const {
// Try to generate strided load node.
auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
@@ -6850,29 +6866,97 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
return false;
if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
return false;
+ }
+ return true;
+}
- // Iterate through all pointers and check if all distances are
- // unique multiple of Dist.
- SmallSet<int64_t, 4> Dists;
- for (Value *Ptr : PointerOps) {
- int64_t Dist = 0;
- if (Ptr == PtrN)
- Dist = Diff;
- else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
- // If the strides are not the same or repeated, we can't
- // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+bool BoUpSLP::analyzeConstantStrideCandidate(
+ ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices, int64_t Diff, Value *Ptr0,
+ Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+ const unsigned Sz = PointerOps.size();
+ SmallVector<int64_t> SortedOffsetsFromBase;
+ SortedOffsetsFromBase.resize(Sz);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ Value *Ptr =
+ SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+ SortedOffsetsFromBase[I] =
+ *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+ }
+ assert(SortedOffsetsFromBase.size() > 1 &&
+ "Trying to generate strided load for less than 2 loads");
+ //
+ // Find where the first group ends.
+ int64_t StrideWithinGroup =
+ SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+ unsigned GroupSize = 1;
+ for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+ if (SortedOffsetsFromBase[GroupSize] -
+ SortedOffsetsFromBase[GroupSize - 1] !=
+ StrideWithinGroup)
+ break;
+ }
+ unsigned VecSz = Sz;
+ Type *ScalarTy = ElemTy;
+ int64_t StrideIntVal = StrideWithinGroup;
+ FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+ if (Sz != GroupSize) {
+ if (Sz % GroupSize != 0)
+ return false;
+ VecSz = Sz / GroupSize;
+
+ if (StrideWithinGroup != 1)
+ return false;
+ unsigned VecSz = Sz / GroupSize;
+ ScalarTy = Type::getIntNTy(SE->getContext(),
+ DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+ GroupSize);
+ StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+ if (!TTI->isTypeLegal(StridedLoadTy) ||
+ !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+ return false;
+
+ unsigned PrevGroupStartIdx = 0;
+ unsigned CurrentGroupStartIdx = GroupSize;
+ int64_t StrideBetweenGroups =
+ SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+ StrideIntVal = StrideBetweenGroups;
+ while (CurrentGroupStartIdx != Sz) {
+ if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+ SortedOffsetsFromBase[PrevGroupStartIdx] !=
+ StrideBetweenGroups)
break;
+ PrevGroupStartIdx = CurrentGroupStartIdx;
+ CurrentGroupStartIdx += GroupSize;
}
- if (Dists.size() == Sz) {
- Type *StrideTy = DL->getIndexType(Ptr0->getType());
- SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
- SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
- return true;
+ if (CurrentGroupStartIdx != Sz)
+ return false;
+
+ auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+ int64_t StrideWithinGroup) -> bool {
+ unsigned GroupEndIdx = StartIdx + 1;
+ for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+ if (SortedOffsetsFromBase[GroupEndIdx] -
+ SortedOffsetsFromBase[GroupEndIdx - 1] !=
+ StrideWithinGroup)
+ break;
+ }
+ return GroupEndIdx - StartIdx == GroupSize0;
+ };
+ for (unsigned I = 0; I < Sz; I += GroupSize) {
+ if (!CheckGroup(I, GroupSize, StrideWithinGroup))
+ return false;
}
}
- return false;
+
+ if (!isStridedLoad(PointerOps, ScalarTy, CommonAlignment, Diff, VecSz))
+ return false;
+
+ Type *StrideTy = DL->getIndexType(Ptr0->getType());
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+ SPtrInfo.Ty = StridedLoadTy;
+ return true;
}
BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
@@ -6958,8 +7042,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
Align Alignment =
cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
->getAlign();
- if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
- SPtrInfo))
+ if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
+ *Diff, Ptr0, PtrN, SPtrInfo))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -14867,11 +14951,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
break;
case TreeEntry::StridedVectorize: {
+ const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+ FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+ assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getStridedMemoryOpCost(
- Instruction::Load, VecTy, LI0->getPointerOperand(),
+ Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind);
+ if (StridedLoadTy != VecTy)
+ VecLdCost +=
+ TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+ getCastContextHint(*E), CostKind);
+
break;
}
case TreeEntry::CompressVectorize: {
@@ -19635,6 +19727,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
? NewLI
: ::propagateMetadata(NewLI, E->Scalars);
+ if (StridedLoadTy != VecTy)
+ V = Builder.CreateBitOrPointerCast(V, VecTy);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 02e05b2e4138a..12725e7d46273 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -passes=slp-vectorizer -S < %s | FileCheck %s
define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) {
; CHECK-LABEL: define void @const_stride_1_no_reordering(
@@ -621,22 +621,10 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
; CHECK-LABEL: define void @constant_stride_widen_no_reordering(
; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
-; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100
-; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200
-; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300
; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[GEP_S0]], align 1
; CHECK-NEXT: ret void
;
%gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
index 82c940353ba5a..60d3b291b5dd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
@@ -5,18 +5,19 @@ define i16 @test(ptr %i) {
; CHECK-LABEL: define i16 @test(
; CHECK-SAME: ptr [[I:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 132860, i64 137774>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[GEP_US154:%.*]] = getelementptr i8, ptr [[I]], i64 132860
; CHECK-NEXT: [[GEP_US154_2:%.*]] = getelementptr i8, ptr [[I]], i64 142688
; CHECK-NEXT: br label %[[FOR_COND5_US:.*]]
; CHECK: [[FOR_COND5_US]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2)
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 [[GEP_US154_2]], i64 4914, <4 x i1> splat (i1 true), i32 4)
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP3]], i32 2, <4 x i1> splat (i1 true), <4 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2)
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP5]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP8]], i16 0)
; CHECK-NEXT: ret i16 [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
index b99a1c2d83394..dbcaafa9e5a8b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
@@ -15,13 +15,15 @@ define void @test() {
; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]]
; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
-; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> <i32 poison, i32 0, i32 20, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> <i32 1, i32 1, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 19, i32 19, i32 19, i32 19, i32 18>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18>
+; CHECK-NEXT: [[TMP7:%.*]] = call <88 x float> @llvm.masked.load.v88f32.p0(ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), i32 16, <88 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <88 x float> poison)
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 87>
+; CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 8 getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), i64 336, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <4 x i32> <i32 poison, i32 87, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHEC...
[truncated]
|
No description provided.