diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9db9acdcc6f9e..adae7caf5917c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9324,52 +9324,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }
 
-/// Creates either vp_store or vp_scatter intrinsics calls to represent
-/// predicated store/scatter.
-static Instruction *
-lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
-                                Value *StoredVal, bool IsScatter, Value *Mask,
-                                Value *EVL, const Align &Alignment) {
-  CallInst *Call;
-  if (IsScatter) {
-    Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
-                                   Intrinsic::vp_scatter,
-                                   {StoredVal, Addr, Mask, EVL});
-  } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    Call = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
-  }
-  Call->addParamAttr(
-      1, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
-}
-
-/// Creates either vp_load or vp_gather intrinsics calls to represent
-/// predicated load/gather.
-static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
-                                                   VectorType *DataTy,
-                                                   Value *Addr, bool IsGather,
-                                                   Value *Mask, Value *EVL,
-                                                   const Align &Alignment) {
-  CallInst *Call;
-  if (IsGather) {
-    Call =
-        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
-                                nullptr, "wide.masked.gather");
-  } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    Call = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
-  }
-  Call->addParamAttr(
-      0, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
-}
-
 void VPWidenLoadRecipe::execute(VPTransformState &State) {
   auto *LI = cast<LoadInst>(&Ingredient);
 
@@ -9391,48 +9345,62 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
       Mask = Builder.CreateVectorReverse(Mask, "reverse");
     }
 
-    // TODO: split this into several classes for better design.
-    if (State.EVL) {
-      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                              "explicit vector length.");
-      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
-                 VPInstruction::ExplicitVectorLength &&
-             "EVL must be VPInstruction::ExplicitVectorLength.");
-      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
-      // If EVL is not nullptr, then EVL must be a valid value set during plan
-      // creation, possibly default value = whole vector register length. EVL
-      // is created only if TTI prefers predicated vectorization, thus if EVL
-      // is not nullptr it also implies preference for predicated
-      // vectorization.
-      // FIXME: Support reverse loading after vp_reverse is added.
-      NewLI = lowerLoadUsingVectorIntrinsics(
-          Builder, DataTy, State.get(getAddr(), Part, !CreateGather),
-          CreateGather, Mask, EVL, Alignment);
-    } else if (CreateGather) {
-      Value *VectorGep = State.get(getAddr(), Part);
-      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, Mask,
-                                         nullptr, "wide.masked.gather");
-      State.addMetadata(NewLI, LI);
+    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
+    if (CreateGather) {
+      NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
+                                         "wide.masked.gather");
+    } else if (Mask) {
+      NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+                                       PoisonValue::get(DataTy),
+                                       "wide.masked.load");
     } else {
-      auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
-      if (Mask)
-        NewLI = Builder.CreateMaskedLoad(DataTy, VecPtr, Alignment, Mask,
-                                         PoisonValue::get(DataTy),
-                                         "wide.masked.load");
-      else
-        NewLI =
-            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
-      // Add metadata to the load, but setVectorValue to the reverse shuffle.
-      State.addMetadata(NewLI, LI);
-      if (Reverse)
-        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+      NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
     }
-
+    // Add metadata to the load, but setVectorValue to the reverse shuffle.
+    State.addMetadata(NewLI, LI);
+    if (Reverse)
+      NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
     State.set(this, NewLI, Part);
   }
 }
 
+void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse loading after vp_reverse is added.
+  assert(!isReverse() && "Reverse loads are not implemented yet.");
+
+  auto *LI = cast<LoadInst>(&Ingredient);
+
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  bool CreateGather = !isConsecutive();
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+  CallInst *NewLI;
+  Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+  Value *Addr = State.get(getAddr(), 0, !CreateGather);
+  Value *Mask =
+      getMask() ? State.get(getMask(), 0)
+                : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  if (CreateGather) {
+    NewLI =
+        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+                                nullptr, "wide.masked.gather");
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Load, DataTy, Addr, "vp.op.load"));
+  }
+  NewLI->addParamAttr(
+      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+  State.addMetadata(NewLI, LI);
+  State.set(this, NewLI, 0);
+}
+
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
 
@@ -9456,45 +9424,62 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
     Value *StoredVal = State.get(StoredVPValue, Part);
     if (isReverse()) {
-      assert(!State.EVL && "reversing not yet implemented with EVL");
       // If we store to reverse consecutive memory locations, then we need
       // to reverse the order of elements in the stored value.
       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
       // We don't want to update the value in the map as it might be used in
       // another expression. So don't call resetVectorValue(StoredVal).
     }
-    // TODO: split this into several classes for better design.
-    if (State.EVL) {
-      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                              "explicit vector length.");
-      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
-                 VPInstruction::ExplicitVectorLength &&
-             "EVL must be VPInstruction::ExplicitVectorLength.");
-      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
-      // If EVL is not nullptr, then EVL must be a valid value set during plan
-      // creation, possibly default value = whole vector register length. EVL
-      // is created only if TTI prefers predicated vectorization, thus if EVL
-      // is not nullptr it also implies preference for predicated
-      // vectorization.
-      // FIXME: Support reverse store after vp_reverse is added.
-      NewSI = lowerStoreUsingVectorIntrinsics(
-          Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal,
-          CreateScatter, Mask, EVL, Alignment);
-    } else if (CreateScatter) {
-      Value *VectorGep = State.get(getAddr(), Part);
-      NewSI =
-          Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, Mask);
-    } else {
-      auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
-      if (Mask)
-        NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask);
-      else
-        NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
-    }
+    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
+    if (CreateScatter)
+      NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
+    else if (Mask)
+      NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+    else
+      NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
     State.addMetadata(NewSI, SI);
   }
 }
 
+void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse stores after vp_reverse is added.
+  assert(!isReverse() && "Reverse stores are not implemented yet.");
+
+  auto *SI = cast<StoreInst>(&Ingredient);
+
+  VPValue *StoredValue = getStoredValue();
+  bool CreateScatter = !isConsecutive();
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+
+  CallInst *NewSI = nullptr;
+  Value *StoredVal = State.get(StoredValue, 0);
+  Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+  Value *Mask =
+      getMask() ? State.get(getMask(), 0)
+                : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  Value *Addr = State.get(getAddr(), 0, !CreateScatter);
+  if (CreateScatter) {
+    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                    Intrinsic::vp_scatter,
+                                    {StoredVal, Addr, Mask, EVL});
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Store, Type::getVoidTy(EVL->getContext()),
+        {StoredVal, Addr}));
+  }
+  NewSI->addParamAttr(
+      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+  State.addMetadata(NewSI, SI);
+}
+
 // Determine how to lower the scalar epilogue, which depends on 1) optimising
 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
 // predication, and 4) a TTI hook that analyses whether the loop is suitable
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 334b10e2e5d09..c74329a0bcc4a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,15 +242,6 @@ struct VPTransformState {
   ElementCount VF;
   unsigned UF;
 
-  /// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
-  /// value set during plan transformation, possibly a default value = whole
-  /// vector register length. EVL is created only if TTI prefers predicated
-  /// vectorization, thus if EVL is not nullptr it also implies preference for
-  /// predicated vectorization.
-  /// TODO: this is a temporarily solution, the EVL must be explicitly used by
-  /// the recipes and must be removed here.
-  VPValue *EVL = nullptr;
-
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
   /// instructions.
@@ -875,7 +866,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     return true;
   case VPRecipeBase::VPInterleaveSC:
   case VPRecipeBase::VPBranchOnMaskSC:
+  case VPRecipeBase::VPWidenLoadEVLSC:
   case VPRecipeBase::VPWidenLoadSC:
+  case VPRecipeBase::VPWidenStoreEVLSC:
   case VPRecipeBase::VPWidenStoreSC:
     // TODO: Widened stores don't define a value, but widened loads do. Split
    // the recipes to be able to make widened loads VPSingleDefRecipes.
@@ -2318,11 +2311,15 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   }
 
 public:
-  VPWidenMemoryRecipe *clone() override = 0;
+  VPWidenMemoryRecipe *clone() override {
+    llvm_unreachable("cloning not supported");
+  }
 
   static inline bool classof(const VPRecipeBase *R) {
-    return R->getVPDefID() == VPDef::VPWidenLoadSC ||
-           R->getVPDefID() == VPDef::VPWidenStoreSC;
+    return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -2390,13 +2387,48 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-
     // Widened, consecutive loads operations only demand the first lane of
     // their address.
     return Op == getAddr() && isConsecutive();
   }
 };
 
+/// A recipe for widening load operations with vector-predication intrinsics,
+/// using the address to load from, the explicit vector length and an optional
+/// mask.
+struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
+  VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
+                            {L->getAddr(), EVL}, L->isConsecutive(), false,
+                            L->getDebugLoc()),
+        VPValue(this, &getIngredient()) {
+    setMask(Mask);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC)
+
+  /// Return the EVL operand.
+  VPValue *getEVL() const { return getOperand(1); }
+
+  /// Generate the wide load or gather.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    // Widened loads only demand the first lane of EVL and consecutive loads
+    // only demand the first lane of their address.
+    return Op == getEVL() || (Op == getAddr() && isConsecutive());
+  }
+};
+
 /// A recipe for widening store operations, using the stored value, the address
 /// to store to and an optional mask.
 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
@@ -2436,6 +2468,50 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
   }
 };
+
+/// A recipe for widening store operations with vector-predication intrinsics,
+/// using the value to store, the address to store to, the explicit vector
+/// length and an optional mask.
+struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
+  VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
+                            {S->getAddr(), S->getStoredValue(), EVL},
+                            S->isConsecutive(), false, S->getDebugLoc()) {
+    setMask(Mask);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
+
+  /// Return the value stored by this recipe.
+  VPValue *getStoredValue() const { return getOperand(1); }
+
+  /// Return the EVL operand.
+  VPValue *getEVL() const { return getOperand(2); }
+
+  /// Generate the wide store or scatter.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    if (Op == getEVL()) {
+      assert(getStoredValue() != Op && "unexpected store of EVL");
+      return true;
+    }
+    // Widened, consecutive memory operations only demand the first lane of
+    // their address, unless the same operand is also stored. That latter can
+    // happen with opaque pointers.
+    return Op == getAddr() && isConsecutive() && Op != getStoredValue();
+  }
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 130fb04f586e7..ad4ea648cd614 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -109,7 +109,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert(isa<VPWidenLoadRecipe>(R) &&
+  assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenLoadEVLRecipe>(R)) &&
          "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 78932643c81fa..9ec422ec002c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -47,6 +47,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   switch (getVPDefID()) {
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+  case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     return true;
   case VPReplicateSC:
@@ -63,6 +64,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
+  case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenPHISC:
   case VPWidenSC:
@@ -81,6 +83,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
 
 bool VPRecipeBase::mayReadFromMemory() const {
   switch (getVPDefID()) {
+  case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
     return true;
   case VPReplicateSC:
@@ -90,6 +93,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPBranchOnMaskSC:
   case VPPredInstPHISC:
   case VPScalarIVStepsSC:
+  case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     return false;
   case VPBlendSC:
@@ -155,7 +159,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   }
   case VPInterleaveSC:
     return mayWriteToMemory();
+  case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
+  case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     assert(
         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
@@ -411,8 +417,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
     Value *AVL = State.Builder.CreateSub(TripCount, Index);
     Value *EVL = GetEVL(State, AVL);
-    assert(!State.EVL && "multiple EVL recipes");
-    State.EVL = this;
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
@@ -1778,11 +1782,25 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
   printOperands(O, SlotTracker);
 }
 
+void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                                 VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN ";
+  printAsOperand(O, SlotTracker);
+  O << " = vp.load ";
+  printOperands(O, SlotTracker);
+}
+
 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN store ";
   printOperands(O, SlotTracker);
 }
+
+void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN vp.store ";
+  printOperands(O, SlotTracker);
+}
 #endif
 
 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 901ecd10c69d8..007dc3f89b3fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1203,43 +1203,52 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   return LaneMaskPhi;
 }
 
-/// Replaces (ICMP_ULE, WideCanonicalIV, backedge-taken-count) pattern using
-/// the given \p Idiom.
-static void
-replaceHeaderPredicateWith(VPlan &Plan, VPValue &Idiom,
-                           function_ref<bool(VPUser &, unsigned)> Cond = {}) {
+/// Collect all VPValues representing a header mask through the (ICMP_ULE,
+/// WideCanonicalIV, backedge-taken-count) pattern.
+/// TODO: Introduce explicit recipe for header-mask instead of searching
+/// for the header-mask pattern manually.
+static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
+  SmallVector<VPValue *> WideCanonicalIVs;
   auto *FoundWidenCanonicalIVUser =
       find_if(Plan.getCanonicalIV()->users(),
              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
-  if (FoundWidenCanonicalIVUser == Plan.getCanonicalIV()->users().end())
-    return;
-  auto *WideCanonicalIV =
-      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
-  // Walk users of WideCanonicalIV and replace all compares of the form
-  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with
-  // the given idiom VPValue.
+  assert(count_if(Plan.getCanonicalIV()->users(),
+                  [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
+             1 &&
+         "Must have at most one VPWidenCanonicalIVRecipe");
+  if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
+    auto *WideCanonicalIV =
+        cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+    WideCanonicalIVs.push_back(WideCanonicalIV);
+  }
+
+  // Also include VPWidenIntOrFpInductionRecipes that represent a widened
+  // version of the canonical induction.
+  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+    if (WidenOriginalIV && WidenOriginalIV->isCanonical())
+      WideCanonicalIVs.push_back(WidenOriginalIV);
+  }
+
+  // Walk users of wide canonical IVs and collect all compares of the form
+  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
+  SmallVector<VPValue *> HeaderMasks;
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
-    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
-    if (!CompareToReplace ||
-        CompareToReplace->getOpcode() != Instruction::ICmp ||
-        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
-        CompareToReplace->getOperand(1) != BTC)
-      continue;
+  for (auto *Wide : WideCanonicalIVs) {
+    for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
+      auto *HeaderMask = dyn_cast<VPInstruction>(U);
+      if (!HeaderMask || HeaderMask->getOpcode() != Instruction::ICmp ||
+          HeaderMask->getPredicate() != CmpInst::ICMP_ULE ||
+          HeaderMask->getOperand(1) != BTC)
+        continue;
 
-    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
-           "WidenCanonicalIV must be the first operand of the compare");
-    if (Cond) {
-      CompareToReplace->replaceUsesWithIf(&Idiom, Cond);
-      if (!CompareToReplace->getNumUsers())
-        CompareToReplace->eraseFromParent();
-    } else {
-      CompareToReplace->replaceAllUsesWith(&Idiom);
-      CompareToReplace->eraseFromParent();
+      assert(HeaderMask->getOperand(0) == Wide &&
+             "WidenCanonicalIV must be the first operand of the compare");
+      HeaderMasks.push_back(HeaderMask);
     }
   }
-  if (!WideCanonicalIV->getNumUsers())
-    WideCanonicalIV->eraseFromParent();
+  return HeaderMasks;
 }
 
 void VPlanTransforms::addActiveLaneMask(
@@ -1271,7 +1280,8 @@ void VPlanTransforms::addActiveLaneMask(
   // Walk users of WideCanonicalIV and replace all compares of the form
   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
   // active-lane-mask.
-  replaceHeaderPredicateWith(Plan, *LaneMask);
+  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
+    HeaderMask->replaceAllUsesWith(LaneMask);
 }
 
 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
@@ -1301,17 +1311,7 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
   VPValue *StartV = CanonicalIVPHI->getStartValue();
 
-  // TODO: revisit this and try to remove the mask operand.
-  // Walk VPWidenMemoryInstructionRecipe users of WideCanonicalIV and replace
-  // all compares of the form (ICMP_ULE, WideCanonicalIV, backedge-taken-count),
-  // used as mask in VPWidenMemoryInstructionRecipe, with an all-true-mask.
-  Value *TrueMask =
-      ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext());
-  VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask);
-  replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) {
-    return isa<VPWidenMemoryRecipe>(U);
-  });
-  // Now create the ExplicitVectorLengthPhi recipe in the main loop.
+  // Create the ExplicitVectorLengthPhi recipe in the main loop.
   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
   EVLPhi->insertAfter(CanonicalIVPHI);
   auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
@@ -1336,6 +1336,31 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
   NextEVLIV->insertBefore(CanonicalIVIncrement);
   EVLPhi->addOperand(NextEVLIV);
 
+  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
+    for (VPUser *U : collectUsersRecursively(HeaderMask)) {
+      auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
+      if (!MemR)
+        continue;
+      assert(!MemR->isReverse() &&
+             "Reversed memory operations not supported yet.");
+      VPValue *OrigMask = MemR->getMask();
+      assert(OrigMask && "Unmasked widen memory recipe when folding tail");
+      VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
+      if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
+        auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+        N->insertBefore(L);
+        L->replaceAllUsesWith(N);
+        L->eraseFromParent();
+      } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
+        auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+        N->insertBefore(S);
+        S->eraseFromParent();
+      } else {
+        llvm_unreachable("unsupported recipe");
+      }
+    }
+    recursivelyDeleteDeadRecipes(HeaderMask);
+  }
   // Replace all uses of VPCanonicalIVPHIRecipe by
   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
   CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 0bbc7ffb4a2fe..96d04271850f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -356,7 +356,9 @@ class VPDef {
     VPWidenCanonicalIVSC,
     VPWidenCastSC,
     VPWidenGEPSC,
+    VPWidenLoadEVLSC,
     VPWidenLoadSC,
+    VPWidenStoreEVLSC,
     VPWidenStoreSC,
     VPWidenSC,
     VPWidenSelectSC,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
index 835ff37568817..ae01bdd371106 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -26,7 +26,6 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
@@ -36,9 +35,7 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
-; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
@@ -47,17 +44,16 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
 ; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
-; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT:    call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT:    call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
-; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
index 72b881bd44c76..8caa9368bfde1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -27,14 +27,14 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT:    vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
 ; IF-EVL-NEXT:    CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
 ; IF-EVL-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
-; IF-EVL-NEXT:    WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, ir<true>
+; IF-EVL-NEXT:    WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
 ; IF-EVL-NEXT:    CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
 ; IF-EVL-NEXT:    vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
-; IF-EVL-NEXT:    WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, ir<true>
+; IF-EVL-NEXT:    WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
 ; IF-EVL-NEXT:    WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
 ; IF-EVL-NEXT:    CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:    vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
-; IF-EVL-NEXT:    WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, ir<true>
+; IF-EVL-NEXT:    WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
 ; IF-EVL-NEXT:    SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
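
Note: the following is a minimal, hand-written sketch (not part of the patch or its tests) of the IR shape the new VPWidenLoadEVLRecipe/VPWidenStoreEVLRecipe lower to for a consecutive, tail-folded access. The function name @evl_sketch, the value names and the nxv4i32 element type are illustrative assumptions; the all-true mask and the EVL produced by @llvm.experimental.get.vector.length mirror what the RISC-V tests above check for.

declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)

define void @evl_sketch(ptr noalias %dst, ptr noalias %src, i64 %n) {
entry:
  br label %vector.body

vector.body:                                      ; EVL-based vectorized loop
  %evl.iv = phi i64 [ 0, %entry ], [ %evl.iv.next, %vector.body ]
  ; The remaining trip count drives the explicit vector length for this step.
  %avl = sub i64 %n, %evl.iv
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %evl.iv
  %vp.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %gep.src, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %evl)
  %add = add nsw <vscale x 4 x i32> %vp.load, %vp.load
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %evl.iv
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %add, ptr align 4 %gep.dst, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %evl)
  ; The EVL-based IV advances by the number of elements actually processed.
  %evl.zext = zext i32 %evl to i64
  %evl.iv.next = add i64 %evl.iv, %evl.zext
  %done = icmp uge i64 %evl.iv.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}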