diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 373ea751d568a..e5deac7975728 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9127,7 +9127,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { "Unexpected type."); auto *IVR = getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast(State.get(IVR, 0)); + PHINode *CanonicalIV = cast(State.get(IVR, 0, /*IsScalar*/ true)); if (onlyScalarsGenerated(State.VF.isScalable())) { // This is the normalized GEP that starts counting at zero. @@ -9243,7 +9243,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); - Value *PrevInChain = State.get(getChainOp(), 0); + Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true); RecurKind Kind = RdxDesc.getRecurrenceKind(); bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); // Propagate the fast-math flags carried by the underlying instruction. @@ -9252,8 +9252,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) - : State.get(Cond, {Part, 0}); + Value *NewCond = State.get(Cond, Part, State.VF.isScalar()); VectorType *VecTy = dyn_cast(NewVecOp->getType()); Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, @@ -9278,7 +9277,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { NewVecOp); PrevInChain = NewRed; } else { - PrevInChain = State.get(getChainOp(), Part); + PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { @@ -9289,7 +9288,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { else NextInChain = State.Builder.CreateBinOp( (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); - State.set(this, NextInChain, Part); + State.set(this, NextInChain, Part, /*IsScalar*/ true); } } @@ -9404,7 +9403,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). } - auto *VecPtr = State.get(getAddr(), Part); + auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); if (isMaskRequired) NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); @@ -9428,7 +9427,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { nullptr, "wide.masked.gather"); State.addMetadata(NewLI, LI); } else { - auto *VecPtr = State.get(getAddr(), Part); + auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( DataTy, VecPtr, Alignment, BlockInMaskParts[Part], diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4ffbed4b705ca..4aeab6fc61998 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -242,7 +242,16 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { return Extract; } -Value *VPTransformState::get(VPValue *Def, unsigned Part) { +Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) { + if (NeedsScalar) { + assert((VF.isScalar() || Def->isLiveIn() || + (hasScalarValue(Def, VPIteration(Part, 0)) && + Data.PerPartScalars[Def][Part].size() == 1)) && + "Trying to access a single scalar per part but has multiple scalars " + "per part."); + return get(Def, VPIteration(Part, 0)); + } + // If Values have been set for this Def return the one relevant for \p Part. if (hasVectorValue(Def, Part)) return Data.PerPartOutput[Def][Part]; @@ -789,21 +798,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, auto *TCMO = Builder.CreateSub(TripCountV, ConstantInt::get(TripCountV->getType(), 1), "trip.count.minus.1"); - auto VF = State.VF; - Value *VTCMO = - VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(BackedgeTakenCount, VTCMO, Part); + BackedgeTakenCount->setUnderlyingValue(TCMO); } - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(&VectorTripCount, VectorTripCountV, Part); + VectorTripCount.setUnderlyingValue(VectorTripCountV); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. - State.set(&VFxUF, - createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF), - 0); + VFxUF.setUnderlyingValue( + createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF)); // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. @@ -884,12 +887,16 @@ void VPlan::execute(VPTransformState *State) { isa(PhiR) || (isa(PhiR) && cast(PhiR)->isOrdered()); + bool NeedsScalar = isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *Phi = State->get(PhiR, Part); - Value *Val = State->get(PhiR->getBackedgeValue(), - SinglePartNeeded ? State->UF - 1 : Part); + Value *Phi = State->get(PhiR, Part, NeedsScalar); + Value *Val = + State->get(PhiR->getBackedgeValue(), + SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar); cast(Phi)->addIncoming(Val, VectorLatchBB); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index aaddaaff2aba2..47cebecdb2708 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -259,9 +259,10 @@ struct VPTransformState { DenseMap PerPartScalars; } Data; - /// Get the generated Value for the given VPValue \p Def and the given \p Part. - /// \see set. - Value *get(VPValue *Def, unsigned Part); + /// Get the generated vector Value for a given VPValue \p Def and a given \p + /// Part if \p IsScalar is false, otherwise return the generated scalar + /// for \p Part. \See set. + Value *get(VPValue *Def, unsigned Part, bool IsScalar = false); /// Get the generated Value for a given VPValue and given Part and Lane. Value *get(VPValue *Def, const VPIteration &Instance); @@ -282,14 +283,22 @@ struct VPTransformState { I->second[Instance.Part][CacheIdx]; } - /// Set the generated Value for a given VPValue and a given Part. - void set(VPValue *Def, Value *V, unsigned Part) { + /// Set the generated vector Value for a given VPValue and a given Part, if \p + /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0). + void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) { + if (IsScalar) { + set(Def, V, VPIteration(Part, 0)); + return; + } + assert((VF.isScalar() || V->getType()->isVectorTy()) && + "scalar values must be stored as (Part, 0)"); if (!Data.PerPartOutput.count(Def)) { DataState::PerPartValuesTy Entry(UF); Data.PerPartOutput[Def] = Entry; } Data.PerPartOutput[Def][Part] = V; } + /// Reset an existing vector value for \p Def and a given \p Part. void reset(VPValue *Def, Value *V, unsigned Part) { auto Iter = Data.PerPartOutput.find(Def); @@ -1376,6 +1385,13 @@ class VPScalarCastRecipe : public VPSingleDefRecipe { /// Returns the result type of the cast. Type *getResultType() const { return ResultTy; } + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + // At the moment, only uniform codegen is implemented. + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// A recipe for widening Call instructions. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2d2f6acf913f1..27b72575ddd51 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -279,11 +279,12 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Builder.SetCurrentDebugLocation(getDebugLoc()); if (Instruction::isBinaryOp(getOpcode())) { + bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); if (Part != 0 && vputils::onlyFirstPartUsed(this)) - return State.get(this, 0); + return State.get(this, 0, OnlyFirstLaneUsed); - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); + Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); auto *Res = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); if (auto *I = dyn_cast(Res)) @@ -385,8 +386,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, if (Part != 0) return nullptr; // First create the compare. - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); + Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true); + Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true); Value *Cond = Builder.CreateICmpEQ(IV, TC); // Now create the branch. @@ -407,7 +408,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, } case VPInstruction::ComputeReductionResult: { if (Part != 0) - return State.get(this, 0); + return State.get(this, 0, /*IsScalar*/ true); // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary // and will be removed by breaking up the recipe further. @@ -424,7 +425,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Type *PhiTy = OrigPhi->getType(); VectorParts RdxParts(State.UF); for (unsigned Part = 0; Part < State.UF; ++Part) - RdxParts[Part] = State.get(LoopExitingDef, Part); + RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the @@ -512,9 +513,15 @@ void VPInstruction::execute(VPTransformState &State) { if (!hasResult()) continue; assert(GeneratedValue && "generateInstruction must produce a value"); - State.set(this, GeneratedValue, Part); + + bool IsVector = GeneratedValue->getType()->isVectorTy(); + State.set(this, GeneratedValue, Part, !IsVector); + assert((IsVector || getOpcode() == VPInstruction::ComputeReductionResult || + State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) && + "scalar value but not only first lane used"); } } + bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) @@ -530,8 +537,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::BranchOnCount: - // TODO: Cover additional operands. - return getOperand(0) == Op; + return true; }; llvm_unreachable("switch should return"); } @@ -1344,7 +1350,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) { PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); } - State.set(this, PartPtr, Part); + State.set(this, PartPtr, Part, /*IsScalar*/ true); } } @@ -1640,7 +1646,7 @@ void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { EntryPart->addIncoming(Start, VectorPH); EntryPart->setDebugLoc(getDebugLoc()); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, EntryPart, Part); + State.set(this, EntryPart, Part, /*IsScalar*/ true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1711,7 +1717,7 @@ void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.get(getOperand(0), 0); + Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; @@ -1801,7 +1807,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); - State.set(this, EntryPart, Part); + State.set(this, EntryPart, Part, IsInLoop); } BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); @@ -1833,7 +1839,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = State.get(this, Part); + Value *EntryPart = State.get(this, Part, IsInLoop); // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? StartV : Iden; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index b114716b7c13d..1d2c17e91b7ab 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -73,12 +73,6 @@ class VPValue { // for multiple underlying IRs (Polly?) by providing a new VPlan front-end, // back-end and analysis information for the new IR. - // Set \p Val as the underlying Value of this VPValue. - void setUnderlyingValue(Value *Val) { - assert(!UnderlyingVal && "Underlying Value is already set."); - UnderlyingVal = Val; - } - public: /// Return the underlying Value attached to this VPValue. Value *getUnderlyingValue() { return UnderlyingVal; } @@ -192,6 +186,12 @@ class VPValue { /// is a live-in value. /// TODO: Also handle recipes defined in pre-header blocks. bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); } + + // Set \p Val as the underlying Value of this VPValue. + void setUnderlyingValue(Value *Val) { + assert(!UnderlyingVal && "Underlying Value is already set."); + UnderlyingVal = Val; + } }; typedef DenseMap Value2VPValueTy; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll index 94f24fea3609c..13fc0eaafb808 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -117,10 +117,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1 -; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector [[BROADCAST_SPLATINSERT4]], poison, zeroinitializer ; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index be83329d30fef..51d2648205030 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -142,7 +142,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY19:%.*]] -; CHECK: vector.body19: +; CHECK: vector.body17: ; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0