diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b268c81e550cf..ac94d0dbcc4cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7137,18 +7137,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     return nullptr;
   };
+  // Check if a select for a safe divisor was hoisted to the pre-header. If so,
+  // the select doesn't need to be considered for the vector loop cost; go with
+  // the more accurate VPlan-based cost model.
   for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
-    // Reverse operations for reverse memory accesses may be hoisted to the
-    // preheader by LICM if the reversed value is loop invariant. In this case,
-    // the VPlan-based cost model diverges from the legacy cost model.
-    if (match(&R,
-              m_CombineOr(m_Reverse(m_VPValue()),
-                          m_Intrinsic<Intrinsic::experimental_vp_reverse>())))
-      return true;
-
-    // Check if a select for a safe divisor was hoisted to the pre-header. If
-    // so, the select doesn't need to be considered for the vector loop cost; go
-    // with the more accurate VPlan-based cost model.
     auto *VPI = dyn_cast<VPInstruction>(&R);
     if (!VPI || VPI->getOpcode() != Instruction::Select)
       continue;
@@ -7201,6 +7193,20 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
                      CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
       return true;
+
+    if (WidenMemR->isReverse()) {
+      // If the stored value of a reverse store is invariant, LICM will
+      // hoist the reverse operation to the preheader. In this case, the
+      // result of the VPlan-based cost model will diverge from that of
+      // the legacy model.
+      if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
+        if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+          return true;
+
+      if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
+        if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+          return true;
+    }
   }
   // The legacy cost model costs non-header phis with a scalar VF as a phi,
@@ -7748,13 +7754,10 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
     Ptr = VectorPtr;
   }
-  if (Reverse && Mask)
-    Mask = Builder.createNaryOp(VPInstruction::Reverse, Mask, I->getDebugLoc());
-
   if (VPI->getOpcode() == Instruction::Load) {
     auto *Load = cast<LoadInst>(I);
-    auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, *VPI,
-                                        Load->getDebugLoc());
+    auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+                                        *VPI, Load->getDebugLoc());
     if (Reverse) {
       Builder.insert(LoadR);
       return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
@@ -7768,8 +7771,8 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
   if (Reverse)
     StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
                                      Store->getDebugLoc());
-  return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive, *VPI,
-                                Store->getDebugLoc());
+  return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
+                                Reverse, *VPI, Store->getDebugLoc());
 }
 VPWidenIntOrFpInductionRecipe *
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5bac173262468..ab47d927942db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3535,6 +3535,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   /// Whether the accessed addresses are consecutive.
   bool Consecutive;
+  /// Whether the consecutive accessed addresses are in reverse order.
+  bool Reverse;
+
   /// Whether the memory access is masked.
   bool IsMasked = false;
@@ -3548,10 +3551,15 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, const VPIRMetadata &Metadata,
-                      DebugLoc DL)
+                      bool Consecutive, bool Reverse,
+                      const VPIRMetadata &Metadata, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
-        Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive) {}
+        Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+        Reverse(Reverse) {
+    assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+           "Reversed access without VPVectorEndPointerRecipe address?");
+  }
 public:
   VPWidenMemoryRecipe *clone() override {
@@ -3573,6 +3581,10 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   /// Return whether the loaded-from / stored-to addresses are consecutive.
   bool isConsecutive() const { return Consecutive; }
+  /// Return whether the consecutive loaded/stored addresses are in reverse
+  /// order.
+  bool isReverse() const { return Reverse; }
+
   /// Return the address accessed by this recipe.
   VPValue *getAddr() const { return getOperand(0); }
@@ -3606,16 +3618,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
 struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                                    public VPRecipeValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, const VPIRMetadata &Metadata, DebugLoc DL)
+                    bool Consecutive, bool Reverse,
+                    const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPRecipeBase::VPWidenLoadSC, Load, {Addr},
-                            Consecutive, Metadata, DL),
+                            Consecutive, Reverse, Metadata, DL),
         VPRecipeValue(this, &Load) {
     setMask(Mask);
   }
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, *this, getDebugLoc());
+                                 getMask(), Consecutive, Reverse, *this,
+                                 getDebugLoc());
   }
   VP_CLASSOF_IMPL(VPRecipeBase::VPWidenLoadSC);
@@ -3648,7 +3662,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe,
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPRecipeBase::VPWidenLoadEVLSC, L.getIngredient(),
-                            {Addr, &EVL}, L.isConsecutive(), L,
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
                             L.getDebugLoc()),
         VPRecipeValue(this, &getIngredient()) {
     setMask(Mask);
@@ -3687,17 +3701,18 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe,
 /// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, - VPValue *Mask, bool Consecutive, + VPValue *Mask, bool Consecutive, bool Reverse, const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPRecipeBase::VPWidenStoreSC, Store, - {Addr, StoredVal}, Consecutive, Metadata, DL) { + {Addr, StoredVal}, Consecutive, Reverse, Metadata, + DL) { setMask(Mask); } VPWidenStoreRecipe *clone() override { return new VPWidenStoreRecipe(cast(Ingredient), getAddr(), getStoredValue(), getMask(), Consecutive, - *this, getDebugLoc()); + Reverse, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPRecipeBase::VPWidenStoreSC); @@ -3732,8 +3747,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr, VPValue *StoredVal, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPRecipeBase::VPWidenStoreEVLSC, S.getIngredient(), - {Addr, StoredVal, &EVL}, S.isConsecutive(), S, - S.getDebugLoc()) { + {Addr, StoredVal, &EVL}, S.isConsecutive(), + S.isReverse(), S, S.getDebugLoc()) { setMask(Mask); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7a53ebd375ca2..7eefd77045050 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1029,6 +1029,8 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( return TTI::CastContextHint::Normal; if (!WidenMemoryRecipe->isConsecutive()) return TTI::CastContextHint::GatherScatter; + if (WidenMemoryRecipe->isReverse()) + return TTI::CastContextHint::Reversed; if (WidenMemoryRecipe->isMasked()) return TTI::CastContextHint::Masked; return TTI::CastContextHint::Normal; @@ -1036,7 +1038,6 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( VPValue *Operand = getOperand(0); TTI::CastContextHint CCH = TTI::CastContextHint::None; - bool IsReverse = false; // For Trunc/FPTrunc, get the context from the only user. if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * { @@ -1045,10 +1046,8 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( return dyn_cast(*R->user_begin()); }; if (VPRecipeBase *Recipe = GetOnlyUser(this)) { - if (match(Recipe, m_Reverse(m_VPValue()))) { + if (match(Recipe, m_Reverse(m_VPValue()))) Recipe = GetOnlyUser(cast(Recipe)); - IsReverse = true; - } if (Recipe) CCH = ComputeCCH(Recipe); } @@ -1058,16 +1057,12 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( Opcode == Instruction::FPExt) { if (auto *Recipe = Operand->getDefiningRecipe()) { VPValue *ReverseOp; - if (match(Recipe, m_Reverse(m_VPValue(ReverseOp)))) { + if (match(Recipe, m_Reverse(m_VPValue(ReverseOp)))) Recipe = ReverseOp->getDefiningRecipe(); - IsReverse = true; - } if (Recipe) CCH = ComputeCCH(Recipe); } } - if (IsReverse && CCH != TTI::CastContextHint::None) - CCH = TTI::CastContextHint::Reversed; auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand); Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy; @@ -1249,13 +1244,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } case VPInstruction::Reverse: { assert(VF.isVector() && "Reverse operation must be vector type"); - Type *EltTy = Ctx.Types.inferScalarType(this); - // Skip the reverse operation cost for the mask. 
-    // FIXME: Remove this once redundant mask reverse operations can be
-    // eliminated by VPlanTransforms::cse before cost computation.
-    if (EltTy->isIntegerTy(1))
-      return 0;
-    auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
+    auto *VectorTy = cast<VectorType>(
+        toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
     return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                                   VectorTy, /*Mask=*/{}, Ctx.CostKind,
                                   /*Index=*/0);
@@ -1943,13 +1933,6 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
                                             const VPRecipeWithIRFlags &R,
                                             ElementCount VF,
                                             VPCostContext &Ctx) {
-  Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
-  // Skip the reverse operation cost for the mask.
-  // FIXME: Remove this once redundant mask reverse operations can be eliminated
-  // by VPlanTransforms::cse before cost computation.
-  if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
-    return InstructionCost(0);
-
   // Some backends analyze intrinsic arguments to determine cost. Use the
   // underlying value for the operand if it has one. Otherwise try to use the
   // operand of the underlying call instruction, if there is one. Otherwise
@@ -1969,6 +1952,7 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
     Arguments.push_back(V);
   }
+  Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
   Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
   SmallVector<Type *> ParamTys;
   for (const VPValue *Op : Operands) {
@@ -3817,27 +3801,9 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
   // TODO: Using the original IR may not be accurate.
   // Currently, ARM will use the underlying IR to calculate gather/scatter
   // instruction cost.
-  [[maybe_unused]] auto IsReverse = [this]() {
-    // Check if mask is reversed.
-    if (VPValue *Mask = getMask())
-      if (match(Mask, m_Reverse(m_VPValue())))
-        return true;
-
-    // For loads, check if the single user is a reverse operation.
-    if (isa<VPWidenLoadRecipe>(this)) {
-      auto *U = getVPSingleValue()->getSingleUser();
-      return U && match(cast(U), m_Reverse(m_VPValue()));
-    }
-
-    // For stores, check if the stored value is reversed.
-    VPValue *StoredVal =
-        isa<VPWidenStoreRecipe>(this)
-            ? cast<VPWidenStoreRecipe>(this)->getStoredValue()
-            : cast<VPWidenStoreEVLRecipe>(this)->getStoredValue();
-    return match(StoredVal, m_Reverse(m_VPValue()));
-  };
-  assert(!IsReverse() &&
-         "Inconsecutive memory access should not have reverse order");
+  assert(!Reverse &&
+         "Inconsecutive memory access should not have reverse order");
   const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
   Type *PtrTy = Ptr->getType();
@@ -3881,8 +3847,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   Value *Mask = nullptr;
-  if (auto *VPMask = getMask())
+  if (auto *VPMask = getMask()) {
+    // Mask reversal is only needed for non-all-one (null) masks, as the
+    // reverse of an all-one (null) mask is itself all-one (null).
     Mask = State.get(VPMask);
+    if (isReverse())
+      Mask = Builder.CreateVectorReverse(Mask, "reverse");
+  }
   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
   Value *NewLI;
@@ -3910,6 +3881,17 @@ void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
 }
 #endif
+/// Use an all-true mask for the reverse rather than the actual mask, as it
+/// avoids a dependence without affecting the result.
+static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, + Value *EVL, const Twine &Name) { + VectorType *ValTy = cast(Operand->getType()); + Value *AllTrueMask = + Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); + return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, + {Operand, AllTrueMask, EVL}, nullptr, Name); +} + void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { Type *ScalarDataTy = getLoadStoreType(&Ingredient); auto *DataTy = VectorType::get(ScalarDataTy, State.VF); @@ -3920,10 +3902,13 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { Value *EVL = State.get(getEVL(), VPLane(0)); Value *Addr = State.get(getAddr(), !CreateGather); Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) + if (VPValue *VPMask = getMask()) { Mask = State.get(VPMask); - else + if (isReverse()) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); + } else { Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + } if (CreateGather) { NewLI = @@ -3975,8 +3960,13 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; Value *Mask = nullptr; - if (auto *VPMask = getMask()) + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. Mask = State.get(VPMask); + if (isReverse()) + Mask = Builder.CreateVectorReverse(Mask, "reverse"); + } Value *StoredVal = State.get(StoredVPValue); Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter); @@ -4008,11 +3998,13 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { Value *StoredVal = State.get(StoredValue); Value *EVL = State.get(getEVL(), VPLane(0)); Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) + if (VPValue *VPMask = getMask()) { Mask = State.get(VPMask); - else + if (isReverse()) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); + } else { Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - + } Value *Addr = State.get(getAddr(), !CreateScatter); if (CreateScatter) { NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 78cc39522e8bc..d6e014e71e1f4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -80,11 +80,12 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, *VPI, Ingredient.getDebugLoc()); + false /*Consecutive*/, false /*Reverse*/, *VPI, + Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), - nullptr /*Mask*/, false /*Consecutive*/, *VPI, + nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI, Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI, @@ -1784,6 +1785,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { auto *WidenStoreR = dyn_cast(&R); if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) && !WidenStoreR->isConsecutive()) { + assert(!WidenStoreR->isReverse() && + "Not consecutive memory recipes shouldn't be reversed"); VPValue *Mask = 
WidenStoreR->getMask(); // Only convert the scatter to a scalar store if it is unmasked. @@ -3057,32 +3060,20 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, return EVLEndPtr; }; - auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan, - DL](VPValue *V) -> VPWidenIntrinsicRecipe * { - if (!V) - return nullptr; - auto *Reverse = new VPWidenIntrinsicRecipe( - Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL}, - TypeInfo.inferScalarType(V), {}, {}, DL); - Reverse->insertBefore(&CurRecipe); - return Reverse; - }; - if (match(&CurRecipe, - m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask)))) + m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && + !cast(CurRecipe).isReverse()) return new VPWidenLoadEVLRecipe(cast(CurRecipe), Addr, EVL, Mask); VPValue *ReversedVal; if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) && match(ReversedVal, - m_MaskedLoad(m_VPValue(EndPtr), - m_Reverse(m_RemoveMask(HeaderMask, Mask)))) && - match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) { - Mask = GetVPReverse(Mask); - Addr = AdjustEndPtr(EndPtr); + m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast(ReversedVal)->isReverse()) { auto *LoadR = new VPWidenLoadEVLRecipe( - *cast(ReversedVal), Addr, EVL, Mask); + *cast(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask); LoadR->insertBefore(&CurRecipe); return new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL}, @@ -3091,19 +3082,24 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *StoredVal; if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal), - m_RemoveMask(HeaderMask, Mask)))) + m_RemoveMask(HeaderMask, Mask))) && + !cast(CurRecipe).isReverse()) return new VPWidenStoreEVLRecipe(cast(CurRecipe), Addr, StoredVal, EVL, Mask); if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)), - m_Reverse(m_RemoveMask(HeaderMask, Mask)))) && - match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) { - Mask = GetVPReverse(Mask); - Addr = AdjustEndPtr(EndPtr); - StoredVal = GetVPReverse(ReversedVal); - return new VPWidenStoreEVLRecipe(cast(CurRecipe), Addr, - StoredVal, EVL, Mask); + m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast(CurRecipe).isReverse()) { + auto *NewReverse = new VPWidenIntrinsicRecipe( + Intrinsic::experimental_vp_reverse, + {ReversedVal, Plan->getTrue(), &EVL}, + TypeInfo.inferScalarType(ReversedVal), {}, {}, DL); + NewReverse->insertBefore(&CurRecipe); + return new VPWidenStoreEVLRecipe(cast(CurRecipe), + AdjustEndPtr(EndPtr), NewReverse, EVL, + Mask); } if (auto *Rdx = dyn_cast(&CurRecipe)) @@ -5372,9 +5368,9 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl &NarrowedOps) { // Narrow interleave group to wide load, as transformed VPlan will only // process one original iteration. 
auto *LI = cast(LoadGroup->getInterleaveGroup()->getInsertPos()); - auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(), - LoadGroup->getMask(), /*Consecutive=*/true, - {}, LoadGroup->getDebugLoc()); + auto *L = new VPWidenLoadRecipe( + *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, + /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); L->insertBefore(LoadGroup); NarrowedOps.insert(L); return L; @@ -5529,9 +5525,9 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps); auto *SI = cast(StoreGroup->getInterleaveGroup()->getInsertPos()); - auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr, - /*Consecutive=*/true, {}, - StoreGroup->getDebugLoc()); + auto *S = new VPWidenStoreRecipe( + *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, + /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); S->insertBefore(StoreGroup); StoreGroup->eraseFromParent(); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll index 7c9854fe30b17..7f3b28422e47b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll @@ -23,7 +23,8 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 { ; CHECK: %[[REVERSE7:.*]] = call @llvm.vector.reverse.nxv4f64( %[[WIDEMSKLOAD]]) ; CHECK: %[[FADD:.*]] = fadd %[[REVERSE7]] ; CHECK: %[[REVERSE8:.*]] = call @llvm.vector.reverse.nxv4f64( %[[FADD]]) -; CHECK: call void @llvm.masked.store.nxv4f64.p0( %[[REVERSE8]], ptr align 8 %{{.*}}, %[[REVERSE6]]) +; CHECK: %[[REVERSE9:.*]] = call @llvm.vector.reverse.nxv4i1( %{{.*}}) +; CHECK: call void @llvm.masked.store.nxv4f64.p0( %[[REVERSE8]], ptr align 8 %{{.*}}, %[[REVERSE9]] entry: %cmp7 = icmp sgt i64 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll index 39ae02fad4187..4a4928c637a5c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -46,8 +46,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -24 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP8]], <4 x i1> [[REVERSE3]], <4 x double> poison) +; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP9]], <4 x i1> [[REVERSE5]], <4 x double> poison) ; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], splat (double 1.000000e+00) ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], splat (double 1.000000e+00) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll index 1e7cbfca4b032..1c06b4b77e4a1 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll @@ -29,11 +29,11 @@ define void @reverse_store(ptr %a, i64 %n) !dbg !3 { ; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[VEC_IND]], splat (i64 -1), !dbg [[DBG6:![0-9]+]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[TMP8]], !dbg [[DBG7:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.experimental.vp.reverse.nxv2i64( [[TMP7]], splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = sub nuw nsw i64 [[TMP5]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP12]], !dbg [[DBG8:![0-9]+]] -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.experimental.vp.reverse.nxv2i64( [[TMP7]], splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8]] -; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP13]], ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP12]], !dbg [[DBG8]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP10]], ptr align 8 [[TMP13]], splat (i1 true), i32 [[TMP4]]), !dbg [[DBG8]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll index 9c96f44a9d19b..116a87795fa0f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll @@ -17,12 +17,12 @@ define void @reverse_predicated_store(i1 %c, ptr %dst, i64 %n) #0 { ; CHECK-NEXT: [[IV:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add i64 [[IV]], -1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( zeroinitializer, splat (i1 true), i32 [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = sub nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 0, [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[ARRAYIDX]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( zeroinitializer, splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP1]]) ; CHECK-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[TMP12]], ptr align 4 [[TMP9]], [[VP_REVERSE_MASK]], i32 [[TMP1]]) ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP4]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 6fb1bf0d191e9..094500f07b418 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -37,8 +37,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; 
RV64-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP19]]) ; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) ; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] -; RV64-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP5]] ; RV64-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP14]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP5]] ; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP19]]) ; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] ; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] @@ -69,8 +69,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP9]]) ; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat (i32 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] -; RV32-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP4]] ; RV32-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP15]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP4]] ; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP9]]) ; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64 ; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]] @@ -196,8 +196,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP20]]) ; RV64-NEXT: [[TMP29:%.*]] = add [[REVERSE]], splat (i32 1) ; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] -; RV64-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i64 [[TMP18]] ; RV64-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP29]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i64 [[TMP18]] ; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP35]], splat (i1 true), i32 [[TMP20]]) ; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]] ; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]] @@ -250,8 +250,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP16]]) ; RV32-NEXT: [[TMP22:%.*]] = add [[REVERSE]], splat (i32 1) ; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; RV32-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP23]], i32 [[TMP10]] ; RV32-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP22]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP23]], i32 [[TMP10]] ; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP25]], splat (i1 true), i32 [[TMP16]]) ; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64 ; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]] @@ -421,8 +421,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV64-NEXT: 
[[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP20]]) ; RV64-NEXT: [[TMP29:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] -; RV64-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[TMP30]], i64 [[TMP18]] ; RV64-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP29]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[TMP30]], i64 [[TMP18]] ; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP35]], splat (i1 true), i32 [[TMP20]]) ; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]] ; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]] @@ -475,8 +475,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP16]]) ; RV32-NEXT: [[TMP22:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; RV32-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP23]], i32 [[TMP10]] ; RV32-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP22]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP23]], i32 [[TMP10]] ; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP25]], splat (i1 true), i32 [[TMP16]]) ; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64 ; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]] @@ -622,8 +622,8 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP19]]) ; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] -; RV64-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP5]] ; RV64-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP14]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP5]] ; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP19]]) ; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] ; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] @@ -654,8 +654,8 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP9]]) ; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] -; RV32-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP16]], i32 [[TMP4]] ; RV32-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP15]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP16]], i32 [[TMP4]] ; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP9]]) ; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64 ; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]] diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll index 2675b0cf11a29..f1542e403a870 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll @@ -27,8 +27,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP6]] ; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP6]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] @@ -127,17 +127,18 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100) ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP15:%.*]] = sub nuw nsw i64 [[TMP26]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 0, [[TMP15]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP7]] +; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP13]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK6]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP26]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]] ; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 @@ -179,7 +180,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, 
ptr noal ; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP8]] ; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP14]] ; NO-VP-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[REVERSE2]]) -; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[REVERSE4]], ptr align 4 [[TMP22]], [[REVERSE]]) +; NO-VP-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i1( [[TMP10]]) +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[REVERSE4]], ptr align 4 [[TMP22]], [[REVERSE3]]) ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NO-VP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -263,8 +265,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_REVERSE]] ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP10]] -; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP11]] ; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[WIDE_MASKED_GATHER]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP11]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[VP_REVERSE1]], ptr align 1 [[TMP20]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP10]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP11]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll index b2e79f5033ee8..871dac6b9a78b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll @@ -23,11 +23,11 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]] ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv2i64( zeroinitializer, splat (i1 true), i32 [[TMP1]]) ; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = sub nuw nsw i64 [[TMP10]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 0, [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP7]] -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv2i64( zeroinitializer, splat (i1 true), i32 [[TMP1]]) ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[REVERSE]], ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP1]]) ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll index 6fd57e3dd2da2..dc0570110f606 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll @@ -37,8 +37,8 @@ define 
void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: WIDEN-INTRINSIC vp<[[VAL_B:%.+]]> = call llvm.experimental.vp.reverse(ir<[[LOAD_B]]>, ir, vp<[[EVL]]>) ; CHECK-NEXT: WIDEN ir<[[ADD_RESULT:%.+]]> = add vp<[[VAL_B]]>, ir<1> ; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_A:%.+]]> = getelementptr inbounds ir<[[A:%.+]]>, ir<[[IDX_PROM]]> -; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_A]]>, vp<[[EVL]]> ; CHECK-NEXT: WIDEN-INTRINSIC vp<[[STORE_VAL:%.+]]> = call llvm.experimental.vp.reverse(ir<[[ADD_RESULT]]>, ir, vp<[[EVL]]>) +; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_A]]>, vp<[[EVL]]> ; CHECK-NEXT: WIDEN vp.store vp<[[VEC_END_PTR_A]]>, vp<[[STORE_VAL]]>, vp<[[EVL]]> ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[EVL]]>, vp<[[EVL_PHI]]> ; CHECK-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[EVL]]> diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 672257bfda3f8..e2b713e868fa7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1139,12 +1139,12 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i64 -11 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i64 -15 ; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP15]], <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META25:![0-9]+]] +; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP16]], <4 x i1> [[REVERSE13]], <4 x double> poison), !alias.scope [[META25]] +; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP17]], <4 x i1> [[REVERSE15]], <4 x double> poison), !alias.scope [[META25]] +; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP18]], <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META25]] ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD14]], <4 x double> poison, <4 x i32> @@ -1218,12 +1218,12 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i64 -23 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i64 -31 ; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: 
[[REVERSE15:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP15]], <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META37:![0-9]+]] +; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP16]], <8 x i1> [[REVERSE13]], <8 x double> poison), !alias.scope [[META37]] +; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP17]], <8 x i1> [[REVERSE15]], <8 x double> poison), !alias.scope [[META37]] +; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP18]], <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META37]] ; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD14]], <8 x double> poison, <8 x i32> diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 2bb3b5dc5b3e3..c28631a5a3ddb 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1200,7 +1200,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPRecipeBase) { new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, {}, {}); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); checkVPRecipeCastImpl(&Recipe); @@ -1232,7 +1232,7 @@ TEST_F(VPRecipeTest, CastVPWidenLoadEVLRecipeToVPUser) { VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 8)); - VPWidenLoadRecipe BaseLoad(*Load, Addr, Mask, true, {}, {}); + VPWidenLoadRecipe BaseLoad(*Load, Addr, Mask, true, false, {}, {}); VPWidenLoadEVLRecipe Recipe(BaseLoad, Addr, *EVL, Mask); checkVPRecipeCastImpl(&Recipe); @@ -1249,7 +1249,7 @@ TEST_F(VPRecipeTest, CastVPWidenStoreRecipeToVPUser) { VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *StoredVal = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 42)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenStoreRecipe Recipe(*Store, Addr, StoredVal, Mask, true, {}, {}); + VPWidenStoreRecipe Recipe(*Store, Addr, StoredVal, Mask, true, false, {}, {}); checkVPRecipeCastImpl(&Recipe); @@ -1266,7 +1266,8 @@ TEST_F(VPRecipeTest, CastVPWidenStoreEVLRecipeToVPUser) { VPValue *StoredVal = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 42)); VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 8)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenStoreRecipe BaseStore(*Store, Addr, StoredVal, Mask, true, {}, {}); + VPWidenStoreRecipe BaseStore(*Store, Addr, StoredVal, Mask, true, false, {}, 
+ {}); VPWidenStoreEVLRecipe Recipe(BaseStore, Addr, StoredVal, *EVL, Mask); checkVPRecipeCastImpl(&Recipe); @@ -1352,7 +1353,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); - VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, {}, {}); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1366,7 +1367,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, {}, {}); + VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {}, + {}); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory());
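For reference, a minimal sketch of the IR shape a reversed, masked access expands to after this change, distilled from the sve-vector-reverse-mask4.ll CHECK lines above; the value names, the %end.ptr pointer, and the nxv4f64 element type are illustrative only. The widened memory recipe now reverses the mask itself, while reversing the loaded/stored data remains a separate VPInstruction::Reverse:

  ; mask is reversed by the memory recipe before the masked load
  %rev.mask = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %mask)
  %wide.load = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr align 8 %end.ptr, <vscale x 4 x i1> %rev.mask, <vscale x 4 x double> poison)
  ; data reversal stays a separate reverse operation
  %data = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %wide.load)
  %res = fadd <vscale x 4 x double> %data, splat (double 1.000000e+00)
  %rev.res = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %res)
  %rev.store.mask = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %mask)
  call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %rev.res, ptr align 8 %end.ptr, <vscale x 4 x i1> %rev.store.mask)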