diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 709640d54945b7..132e15cc2c5800 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3483,6 +3483,7 @@ enum class LoadsState { Gather, Vectorize, ScatterVectorize };
 static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                     const TargetTransformInfo &TTI,
                                     const DataLayout &DL, ScalarEvolution &SE,
+                                    LoopInfo &LI,
                                     SmallVectorImpl<unsigned> &Order,
                                     SmallVectorImpl<Value *> &PointerOps) {
   // Check that a vectorized load would load the same memory as a scalar
@@ -3510,30 +3511,59 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   }
 
   Order.clear();
-  // Check the order of pointer operands.
-  if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) {
-    Value *Ptr0;
-    Value *PtrN;
-    if (Order.empty()) {
-      Ptr0 = PointerOps.front();
-      PtrN = PointerOps.back();
-    } else {
-      Ptr0 = PointerOps[Order.front()];
-      PtrN = PointerOps[Order.back()];
+  // Check the order of pointer operands or that all pointers are the same.
+  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+  if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) {
+        if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front()))
+          return false;
+        auto *GEP = dyn_cast<GetElementPtrInst>(P);
+        if (!GEP)
+          return false;
+        auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front());
+        return GEP->getNumOperands() == 2 &&
+               ((isConstant(GEP->getOperand(1)) &&
+                 isConstant(GEP0->getOperand(1))) ||
+                getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)})
+                    .getOpcode());
+      })) {
+    if (IsSorted) {
+      Value *Ptr0;
+      Value *PtrN;
+      if (Order.empty()) {
+        Ptr0 = PointerOps.front();
+        PtrN = PointerOps.back();
+      } else {
+        Ptr0 = PointerOps[Order.front()];
+        PtrN = PointerOps[Order.back()];
+      }
+      Optional<int> Diff =
+          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+      // Check that the sorted loads are consecutive.
+      if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+        return LoadsState::Vectorize;
+    }
+    // TODO: need to improve analysis of the pointers, if not all of them are
+    // GEPs or have > 2 operands, we end up with a gather node, which just
+    // increases the cost.
+    Loop *L = LI.getLoopFor(cast<Instruction>(VL0)->getParent());
+    bool ProfitableGatherPointers =
+        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
+          return L && L->isLoopInvariant(V);
+        })) <= VL.size() / 2 && VL.size() > 2;
+    if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+          auto *GEP = dyn_cast<GetElementPtrInst>(P);
+          return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+                 (GEP && GEP->getNumOperands() == 2);
+        })) {
+      Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+      for (Value *V : VL)
+        CommonAlignment =
+            commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+      auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+      if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
+          !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
+        return LoadsState::ScatterVectorize;
     }
-    Optional<int> Diff =
-        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
-    // Check that the sorted loads are consecutive.
-    if (static_cast<unsigned>(*Diff) == VL.size() - 1)
-      return LoadsState::Vectorize;
-    Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
-    for (Value *V : VL)
-      CommonAlignment =
-          commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
-    auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
-    if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
-        !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
-      return LoadsState::ScatterVectorize;
   }
 
   return LoadsState::Gather;
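(Illustration only, not part of the patch: a minimal IR sketch, modeled on the pr47629 tests below, of the load-bundle shape the extended check above accepts. Function and value names are hypothetical.)

    ; Four loads from one underlying object at non-consecutive constant GEP
    ; offsets: the pointers sort (0, 1, 4, 11) but are not consecutive, so the
    ; consecutive-load check fails; every pointer is a two-operand GEP with a
    ; constant index, though, so on a target where TTI.isLegalMaskedGather()
    ; holds the bundle now becomes ScatterVectorize instead of Gather.
    define void @gather_like(i32* noalias %out, i32* noalias %base) {
      %p11 = getelementptr inbounds i32, i32* %base, i64 11
      %p4 = getelementptr inbounds i32, i32* %base, i64 4
      %p1 = getelementptr inbounds i32, i32* %base, i64 1
      %v0 = load i32, i32* %base, align 4
      %v1 = load i32, i32* %p11, align 4
      %v2 = load i32, i32* %p4, align 4
      %v3 = load i32, i32* %p1, align 4
      %o1 = getelementptr inbounds i32, i32* %out, i64 1
      %o2 = getelementptr inbounds i32, i32* %out, i64 2
      %o3 = getelementptr inbounds i32, i32* %out, i64 3
      store i32 %v0, i32* %out, align 4
      store i32 %v1, i32* %o1, align 4
      store i32 %v2, i32* %o2, align 4
      store i32 %v3, i32* %o3, align 4
      ret void
    }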
@@ -4611,8 +4641,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
     return true;
   };
-  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
-      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
+  SmallVector<unsigned> SortedIndices;
+  BasicBlock *BB = nullptr;
+  bool AreAllSameInsts =
+      (S.getOpcode() && allSameBlock(VL)) ||
+      (S.OpValue->getType()->isPointerTy() && UserTreeIdx.UserTE &&
+       UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+       VL.size() > 2 &&
+       all_of(VL,
+              [&BB](Value *V) {
+                auto *I = dyn_cast<Instruction>(V);
+                if (!I)
+                  return doesNotNeedToBeScheduled(V);
+                if (!BB)
+                  BB = I->getParent();
+                return BB == I->getParent() && I->getNumOperands() == 2;
+              }) &&
+       BB &&
+       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
+                       SortedIndices));
+  if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts ||
+      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
+           S.OpValue) &&
        !all_of(VL, isVectorLikeInstWithConstOps)) ||
       NotProfitableForVectorization(VL)) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
@@ -4683,10 +4733,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
   }
 
+  // Special processing for sorted pointers for ScatterVectorize node with
+  // constant indices only.
+  if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) &&
+      UserTreeIdx.UserTE &&
+      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) {
+    assert(S.OpValue->getType()->isPointerTy() &&
+           count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
+               2 &&
+           "Expected pointers only.");
+    // Reset S to make it GetElementPtr kind of node.
+    const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
+    assert(It != VL.end() && "Expected at least one GEP.");
+    S = getSameOpcode(*It);
+  }
+
   // Check that all of the users of the scalars that we want to vectorize are
   // schedulable.
   auto *VL0 = cast<Instruction>(S.OpValue);
-  BasicBlock *BB = VL0->getParent();
+  BB = VL0->getParent();
 
   if (!DT->isReachableFromEntry(BB)) {
     // Don't go into unreachable blocks. They may contain instructions with
@@ -4875,7 +4940,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       SmallVector<Value *> PointerOps;
       OrdersType CurrentOrder;
       TreeEntry *TE = nullptr;
-      switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder,
+      switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder,
                                 PointerOps)) {
       case LoadsState::Vectorize:
         if (CurrentOrder.empty()) {
@@ -5056,7 +5121,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     case Instruction::GetElementPtr: {
       // We don't combine GEPs with complicated (nested) indexing.
       for (Value *V : VL) {
-        if (cast<GetElementPtrInst>(V)->getNumOperands() != 2) {
+        auto *I = dyn_cast<GetElementPtrInst>(V);
+        if (!I)
+          continue;
+        if (I->getNumOperands() != 2) {
          LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
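(Illustration only, not part of the patch: the special processing added above lets a ScatterVectorize user hand buildTree_rec a pointer-operand bundle that mixes single-index GEPs with the raw base pointer; S is reset to the first GEP so the bundle is handled as a GetElementPtr node. A hypothetical bundle:)

    ; Pointer operands of a masked gather; %base itself is not a GEP, but
    ; sortPtrAccesses() still orders the bundle, and the operand preparation
    ; below later synthesizes a constant index of 0 for it.
    ;   %base
    ;   %p4 = getelementptr inbounds i32, i32* %base, i64 4
    ;   %p9 = getelementptr inbounds i32, i32* %base, i64 9
    ;   %p21 = getelementptr inbounds i32, i32* %base, i64 21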
@@ -5069,7 +5137,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // different types.
       Type *Ty0 = cast<GetElementPtrInst>(VL0)->getSourceElementType();
       for (Value *V : VL) {
-        Type *CurTy = cast<GetElementPtrInst>(V)->getSourceElementType();
+        auto *GEP = dyn_cast<GetElementPtrInst>(V);
+        if (!GEP)
+          continue;
+        Type *CurTy = GEP->getSourceElementType();
         if (Ty0 != CurTy) {
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (different types).\n");
@@ -5080,15 +5151,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         }
       }
 
+      bool IsScatterUser =
+          UserTreeIdx.UserTE &&
+          UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
       // We don't combine GEPs with non-constant indexes.
       Type *Ty1 = VL0->getOperand(1)->getType();
       for (Value *V : VL) {
-        auto Op = cast<GetElementPtrInst>(V)->getOperand(1);
-        if (!isa<ConstantInt>(Op) ||
+        auto *I = dyn_cast<GetElementPtrInst>(V);
+        if (!I)
+          continue;
+        auto *Op = I->getOperand(1);
+        if ((!IsScatterUser && !isa<ConstantInt>(Op)) ||
             (Op->getType() != Ty1 &&
-             Op->getType()->getScalarSizeInBits() >
-                 DL->getIndexSizeInBits(
-                     V->getType()->getPointerAddressSpace()))) {
+             ((IsScatterUser && !isa<ConstantInt>(Op)) ||
+              Op->getType()->getScalarSizeInBits() >
+                  DL->getIndexSizeInBits(
+                      V->getType()->getPointerAddressSpace())))) {
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL, VL0);
@@ -5103,9 +5181,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
       SmallVector<ValueList, 2> Operands(2);
       // Prepare the operand vector for pointer operands.
-      for (Value *V : VL)
-        Operands.front().push_back(
-            cast<GetElementPtrInst>(V)->getPointerOperand());
+      for (Value *V : VL) {
+        auto *GEP = dyn_cast<GetElementPtrInst>(V);
+        if (!GEP) {
+          Operands.front().push_back(V);
+          continue;
+        }
+        Operands.front().push_back(GEP->getPointerOperand());
+      }
       TE->setOperand(0, Operands.front());
       // Need to cast all indices to the same type before vectorization to
       // avoid crash.
@@ -5116,9 +5199,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
       Type *Ty = all_of(VL,
                         [VL0Ty, IndexIdx](Value *V) {
-                          return VL0Ty == cast<GetElementPtrInst>(V)
-                                              ->getOperand(IndexIdx)
-                                              ->getType();
+                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
+                          if (!GEP)
+                            return true;
+                          return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                         })
                     ? VL0Ty
                     : DL->getIndexType(cast<GetElementPtrInst>(VL0)
@@ -5126,10 +5210,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                            ->getOperand(0)
                                            ->getScalarType());
       // Prepare the operand vector.
       for (Value *V : VL) {
-        auto *Op = cast<GetElementPtrInst>(V)->getOperand(IndexIdx);
-        auto *CI = cast<ConstantInt>(Op);
-        Operands.back().push_back(ConstantExpr::getIntegerCast(
-            CI, Ty, CI->getValue().isSignBitSet()));
+        auto *I = dyn_cast<GetElementPtrInst>(V);
+        if (!I) {
+          Operands.back().push_back(
+              ConstantInt::get(Ty, 0, /*isSigned=*/false));
+          continue;
+        }
+        auto *Op = I->getOperand(IndexIdx);
+        auto *CI = dyn_cast<ConstantInt>(Op);
+        if (!CI)
+          Operands.back().push_back(Op);
+        else
+          Operands.back().push_back(ConstantExpr::getIntegerCast(
+              CI, Ty, CI->getValue().isSignBitSet()));
       }
       TE->setOperand(IndexIdx, Operands.back());
 
@@ -5848,8 +5941,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
               !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
             SmallVector<Value *> PointerOps;
             OrdersType CurrentOrder;
-            LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL,
-                                              *SE, CurrentOrder, PointerOps);
+            LoadsState LS =
+                canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
+                                  CurrentOrder, PointerOps);
             switch (LS) {
             case LoadsState::Vectorize:
             case LoadsState::ScatterVectorize:
@@ -5939,7 +6033,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
          "Unhandled state");
-  assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+  assert(E->getOpcode() &&
+         ((allSameType(VL) && allSameBlock(VL)) ||
+          (E->getOpcode() == Instruction::GetElementPtr &&
+           E->getMainOp()->getType()->isPointerTy())) &&
+         "Invalid VL");
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -6258,7 +6356,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
     TargetTransformInfo::OperandValueKind Op1VK =
         TargetTransformInfo::OK_AnyValue;
     TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
+        any_of(VL,
+               [](Value *V) {
+                 return isa<GetElementPtrInst>(V) &&
+                        !isConstant(
+                            cast<GetElementPtrInst>(V)->getOperand(1));
+               })
+            ? TargetTransformInfo::OK_AnyValue
+            : TargetTransformInfo::OK_UniformConstantValue;
 
     InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
         Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
@@ -7270,6 +7375,9 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   auto *Front = E->getMainOp();
   auto *BB = Front->getParent();
   assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
+    if (E->getOpcode() == Instruction::GetElementPtr &&
+        !isa<GetElementPtrInst>(V))
+      return true;
     auto *I = cast<Instruction>(V);
     return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
            isVectorLikeInstWithConstOps(I);
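(Illustration only, not part of the patch: the shape such a GEP bundle takes after vectorization — compare the updated AVX512 CHECK lines in the tests below. The common base is splatted and a single vector GEP supplies the per-lane indices for the masked gather; the lane indices shown here are hypothetical.)

    %b = insertelement <4 x i32*> poison, i32* %base, i64 0
    %bs = shufflevector <4 x i32*> %b, <4 x i32*> poison, <4 x i32> zeroinitializer
    %ps = getelementptr i32, <4 x i32*> %bs, <4 x i64> <i64 0, i64 11, i64 4, i64 1>
    %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ps, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)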
@@ -7517,6 +7625,13 @@ class ShuffleInstructionBuilder {
 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
   const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL);
+  // Special processing for GEPs bundle, which may include non-GEP values.
+  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
+    const auto *It =
+        find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
+    if (It != VL.end())
+      S = getSameOpcode(*It);
+  }
   if (S.getOpcode()) {
     if (TreeEntry *E = getTreeEntry(S.OpValue))
       if (E->isSame(VL)) {
@@ -8049,8 +8164,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       }
 
       Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
-      if (Instruction *I = dyn_cast<Instruction>(V))
-        V = propagateMetadata(I, E->Scalars);
+      if (Instruction *I = dyn_cast<Instruction>(V)) {
+        SmallVector<Value *> GEPs;
+        for (Value *V : E->Scalars) {
+          if (isa<GetElementPtrInst>(V))
+            GEPs.push_back(V);
+        }
+        V = propagateMetadata(I, GEPs);
+      }
 
       ShuffleBuilder.addInversedMask(E->ReorderIndices);
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
@@ -8285,6 +8406,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
     assert(E && "Invalid scalar");
     assert(E->State != TreeEntry::NeedToGather &&
            "Extracting from a gather list");
+    // Non-instruction pointers are not deleted, just skip them.
+    if (E->getOpcode() == Instruction::GetElementPtr &&
+        !isa<GetElementPtrInst>(Scalar))
+      continue;
 
     Value *Vec = E->VectorizedValue;
     assert(Vec && "Can't find vectorizable value");
@@ -8662,6 +8787,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
+      if (Entry->getOpcode() == Instruction::GetElementPtr &&
+          !isa<GetElementPtrInst>(Scalar))
+        continue;
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index f1cb7835c88db5..f0814c7728d74b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -1,27 +1,88 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { -; CHECK-LABEL: @gather_load( -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32*
[[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: ret void +; SSE-LABEL: @gather_load( +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: ret void +; +; AVX-LABEL: @gather_load( +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: ret void +; +; AVX2-LABEL: @gather_load( +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: ret void +; +; AVX512F-LABEL: @gather_load( +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load( +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %1, align 4, !tbaa !2 @@ -254,65 +315,23 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* 
[[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP3]], <8 x i32*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP4]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 -; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 -; AVX512VL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512VL-NEXT: store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512VL-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 4 -; AVX512VL-NEXT: store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP3]], <8 x i32*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP4]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -457,65 +476,23 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 -; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x 
i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x i32*> poison, i32* [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP1]], <8 x i32*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP3]], +; AVX512F-NEXT: [[TMP5:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], -; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to 
<4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x i32*> poison, i32* [[T1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP1]], <8 x i32*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512VL-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP3]], +; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 @@ -730,48 +707,26 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu ; ; AVX512F-LABEL: @gather_load_div( ; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <2 x float*> [[TMP8]], <2 x i64> -; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> [[TMP11]], <8 x i32> -; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP9]], <2 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7 -; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]] -; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], 
<8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]] +; AVX512F-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512F-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( ; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <2 x float*> [[TMP8]], <2 x i64> -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> [[TMP11]], <8 x i32> -; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP9]], <2 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7 -; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]] -; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]] +; AVX512VL-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, 
!tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 3925563bf1f700..174ecb4d839d1f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -1,27 +1,88 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { -; CHECK-LABEL: @gather_load( -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: ret void +; SSE-LABEL: @gather_load( +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], 
align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: ret void +; +; AVX-LABEL: @gather_load( +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: ret void +; +; AVX2-LABEL: @gather_load( +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: ret void +; +; AVX512F-LABEL: @gather_load( +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: 
[[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 +; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load( +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %1, align 4, !tbaa !2 @@ -254,65 +315,23 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa 
[[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP3]], <8 x i32*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP4]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 -; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 -; AVX512VL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512VL-NEXT: store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512VL-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 4 -; AVX512VL-NEXT: store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <8 x i32*> [[TMP3]], <8 x i32*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP4]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -457,65 +476,23 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 -; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x i32*> poison, i32* [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP1]], <8 x i32*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> , 
+; AVX512F-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP3]],
+; AVX512F-NEXT: [[TMP5:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512F-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT: ret void
 ;
 ; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
-; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64>
-; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
-; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]],
-; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x i32*> poison, i32* [[T1:%.*]], i64 0
+; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP1]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64>
+; AVX512VL-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP3]],
+; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT: ret void
 ;
 %t5 = getelementptr inbounds i32, i32* %t0, i64 1
@@ -730,48 +707,26 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
-; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <2 x float*> [[TMP8]], <2 x i64>
-; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <8 x i32>
-; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> [[TMP11]], <8 x i32>
-; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP9]], <2 x float*> poison, <8 x i32>
-; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32>
-; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
-; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
-; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64>
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64>
+; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]]
+; AVX512F-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512F-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT: ret void
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64>
-; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <2 x float*> [[TMP8]], <2 x i64>
-; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <8 x i32>
-; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> [[TMP11]], <8 x i32>
-; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP9]], <2 x float*> poison, <8 x i32>
-; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32>
-; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
-; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
-; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64>
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64>
+; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]]
+; AVX512VL-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512VL-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT: ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
index 4f813ce7d5f323..be99fe4e555e04 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
@@ -4,93 +4,22 @@ define void @test(i32* noalias %p, i32* noalias %addr, i32* noalias %s) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[IDX1:%.*]] = load i32, i32* [[ADDR:%.*]], align 8
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i32 [[IDX1]]
-; CHECK-NEXT: [[I:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 1
-; CHECK-NEXT: [[IDX2:%.*]] = load i32, i32* [[GEP2]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX2]]
-; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I1]], [[I]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32*> poison, i32* [[ADDR:%.*]], i32 0
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i32*> [[TMP0]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE1]], <8 x i32>
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i32 0
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 2
-; CHECK-NEXT: [[IDX3:%.*]] = load i32, i32* [[GEP3]], align 8
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX3]]
-; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 3
-; CHECK-NEXT: [[IDX4:%.*]] = load i32, i32* [[GEP4]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX4]]
-; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[S]], i32 1
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 4
-; CHECK-NEXT: [[IDX5:%.*]] = load i32, i32* [[GEP5]], align 8
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX5]]
-; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 5
-; CHECK-NEXT: [[IDX6:%.*]] = load i32, i32* [[GEP6]], align 8
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX6]]
-; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[S]], i32 2
-; CHECK-NEXT: store i32 [[ADD14]], i32* [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[GEP7:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 6
-; CHECK-NEXT: [[IDX7:%.*]] = load i32, i32* [[GEP7]], align 8
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX7]]
-; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[GEP8:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 7
-; CHECK-NEXT: [[IDX8:%.*]] = load i32, i32* [[GEP8]], align 8
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX8]]
-; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = add nsw i32 [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, i32* [[S]], i32 3
-; CHECK-NEXT: store i32 [[ADD21]], i32* [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[GEP9:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 8
-; CHECK-NEXT: [[IDX9:%.*]] = load i32, i32* [[GEP9]], align 8
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX9]]
-; CHECK-NEXT: [[I8:%.*]] = load i32, i32* [[ARRAYIDX25]], align 4
-; CHECK-NEXT: [[GEP10:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 9
-; CHECK-NEXT: [[IDX10:%.*]] = load i32, i32* [[GEP10]], align 8
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX10]]
-; CHECK-NEXT: [[I9:%.*]] = load i32, i32* [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = add nsw i32 [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[S]], i32 4
-; CHECK-NEXT: store i32 [[ADD28]], i32* [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[GEP11:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 10
-; CHECK-NEXT: [[IDX11:%.*]] = load i32, i32* [[GEP11]], align 8
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX11]]
-; CHECK-NEXT: [[I10:%.*]] = load i32, i32* [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[GEP12:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 11
-; CHECK-NEXT: [[IDX12:%.*]] = load i32, i32* [[GEP12]], align 8
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX12]]
-; CHECK-NEXT: [[I11:%.*]] = load i32, i32* [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = add nsw i32 [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, i32* [[S]], i32 5
-; CHECK-NEXT: store i32 [[ADD35]], i32* [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[GEP13:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 12
-; CHECK-NEXT: [[IDX13:%.*]] = load i32, i32* [[GEP13]], align 8
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX13]]
-; CHECK-NEXT: [[I12:%.*]] = load i32, i32* [[ARRAYIDX39]], align 4
-; CHECK-NEXT: [[GEP14:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 13
-; CHECK-NEXT: [[IDX14:%.*]] = load i32, i32* [[GEP14]], align 8
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX14]]
-; CHECK-NEXT: [[I13:%.*]] = load i32, i32* [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = add nsw i32 [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[S]], i32 6
-; CHECK-NEXT: store i32 [[ADD42]], i32* [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[GEP15:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 14
-; CHECK-NEXT: [[IDX15:%.*]] = load i32, i32* [[GEP15]], align 8
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX15]]
-; CHECK-NEXT: [[I14:%.*]] = load i32, i32* [[ARRAYIDX46]], align 4
-; CHECK-NEXT: [[GEP16:%.*]] = getelementptr inbounds i32, i32* [[ADDR]], i32 15
-; CHECK-NEXT: [[IDX16:%.*]] = load i32, i32* [[GEP16]], align 8
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, i32* [[P]], i32 [[IDX16]]
-; CHECK-NEXT: [[I15:%.*]] = load i32, i32* [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = add nsw i32 [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds i32, i32* [[S]], i32 7
-; CHECK-NEXT: store i32 [[ADD49]], i32* [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE1]], <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 8, <8 x i1> , <8 x i32> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32*> poison, i32* [[P:%.*]], i32 0
+; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i32*> [[TMP4]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE2]], <8 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP5]], i32 4, <8 x i1> , <8 x i32> undef)
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP1]], i32 8, <8 x i1> , <8 x i32> undef)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE2]], <8 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP8]], i32 4, <8 x i1> , <8 x i32> undef)
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[TMP10]], <8 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry: