[LV] Remove redundant IV casts using VPlan (NFCI).

This patch simplifies handling of redundant induction casts, by removing dead cast instructions after initial VPlan construction. This has the following benefits: 1. fixes a crash (see @test_optimized_cast_induction_feeding_first_order_recurrence) 2. Simplifies VPWidenIntOrFpInduction to a single-def recipes 3. Retires recordVectorLoopValueForInductionCast. Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D115112
llvm · Dec 10, 2021 · 505ad03 · 505ad03
1 parent 2586c23
commit 505ad03
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 115 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -510,7 +510,7 @@ class InnerLoopVectorizer {
   /// the corresponding type.
   void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
                              Value *Start, TruncInst *Trunc, VPValue *Def,
-                             VPValue *CastDef, VPTransformState &State);
+                             VPTransformState &State);
 
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
@@ -621,7 +621,7 @@ class InnerLoopVectorizer {
   /// can also be a truncate instruction.
   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                         const InductionDescriptor &ID, VPValue *Def,
-                        VPValue *CastDef, VPTransformState &State);
+                        VPTransformState &State);
 
   /// Create a vector induction phi node based on an existing scalar one. \p
   /// EntryVal is the value from the original loop that maps to the vector phi
@@ -631,7 +631,6 @@ class InnerLoopVectorizer {
   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                        Value *Step, Value *Start,
                                        Instruction *EntryVal, VPValue *Def,
-                                       VPValue *CastDef,
                                        VPTransformState &State);
 
   /// Returns true if an instruction \p I should be scalarized instead of
@@ -641,29 +640,6 @@ class InnerLoopVectorizer {
   /// Returns true if we should generate a scalar version of \p IV.
   bool needsScalarInduction(Instruction *IV) const;
 
-  /// If there is a cast involved in the induction variable \p ID, which should
-  /// be ignored in the vectorized loop body, this function records the
-  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
-  /// cast. We had already proved that the casted Phi is equal to the uncasted
-  /// Phi in the vectorized loop (under a runtime guard), and therefore
-  /// there is no need to vectorize the cast - the same value can be used in the
-  /// vector loop for both the Phi and the cast.
-  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
-  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
-  ///
-  /// \p EntryVal is the value from the original loop that maps to the vector
-  /// phi node and is used to distinguish what is the IV currently being
-  /// processed - original one (if \p EntryVal is a phi corresponding to the
-  /// original IV) or the "newly-created" one based on the proof mentioned above
-  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
-  /// latter case \p EntryVal is a TruncInst and we must not record anything for
-  /// that IV, but it's error-prone to expect callers of this routine to care
-  /// about that, hence this explicit parameter.
-  void recordVectorLoopValueForInductionCast(
-      const InductionDescriptor &ID, const Instruction *EntryVal,
-      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
-      unsigned Part, unsigned Lane = UINT_MAX);
-
   /// Generate a shuffle sequence that will reverse the vector Vec.
   virtual Value *reverseVector(Value *Vec);
 
@@ -2358,8 +2334,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
 
 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
     const InductionDescriptor &II, Value *Step, Value *Start,
-    Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
-    VPTransformState &State) {
+    Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
          "Expected either an induction phi-node or a truncate of it!");
 
@@ -2422,8 +2397,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
 
     if (isa<TruncInst>(EntryVal))
       addMetadata(LastInduction, EntryVal);
-    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
-                                          State, Part);
 
     LastInduction = cast<Instruction>(
         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
@@ -2457,42 +2430,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
   return llvm::any_of(IV->users(), isScalarInst);
 }
 
-void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
-    const InductionDescriptor &ID, const Instruction *EntryVal,
-    Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
-    unsigned Part, unsigned Lane) {
-  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
-         "Expected either an induction phi-node or a truncate of it!");
-
-  // This induction variable is not the phi from the original loop but the
-  // newly-created IV based on the proof that casted Phi is equal to the
-  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
-  // re-uses the same InductionDescriptor that original IV uses but we don't
-  // have to do any recording in this case - that is done when original IV is
-  // processed.
-  if (isa<TruncInst>(EntryVal))
-    return;
-
-  if (!CastDef) {
-    assert(ID.getCastInsts().empty() &&
-           "there are casts for ID, but no CastDef");
-    return;
-  }
-  assert(!ID.getCastInsts().empty() &&
-         "there is a CastDef, but no casts for ID");
-  // Only the first Cast instruction in the Casts vector is of interest.
-  // The rest of the Casts (if exist) have no uses outside the
-  // induction update chain itself.
-  if (Lane < UINT_MAX)
-    State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
-  else
-    State.set(CastDef, VectorLoopVal, Part);
-}
-
 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
                                                 const InductionDescriptor &ID,
                                                 Value *Start, TruncInst *Trunc,
-                                                VPValue *Def, VPValue *CastDef,
+                                                VPValue *Def,
                                                 VPTransformState &State) {
   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
          "Primary induction variable must have an integer type");
@@ -2558,8 +2499,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
       State.set(Def, EntryPart, Part);
       if (Trunc)
         addMetadata(EntryPart, Trunc);
-      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
-                                            State, Part);
     }
   };
 
@@ -2581,23 +2520,21 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
   // least one user in the loop that is not widened.
   auto NeedsScalarIV = needsScalarInduction(EntryVal);
   if (!NeedsScalarIV) {
-    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
-                                    State);
+    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
     return;
   }
 
   // Try to create a new independent vector induction variable. If we can't
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (!shouldScalarizeInstruction(EntryVal)) {
-    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
-                                    State);
+    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
     Value *ScalarIV = CreateScalarIV(Step);
     // Create scalar steps that can be used by instructions we will later
     // scalarize. Note that the addition of the scalar steps will not increase
     // the number of instructions in the loop in the common case prior to
     // InstCombine. We will be trading one vector extract for each scalar step.
-    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
     return;
   }
 
@@ -2607,7 +2544,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
   Value *ScalarIV = CreateScalarIV(Step);
   if (!Cost->isScalarEpilogueAllowed())
     CreateSplatIV(ScalarIV, Step);
-  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
 }
 
 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
@@ -2661,7 +2598,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                            Instruction *EntryVal,
                                            const InductionDescriptor &ID,
-                                           VPValue *Def, VPValue *CastDef,
+                                           VPValue *Def,
                                            VPTransformState &State) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF.isVector() && "VF should be greater than one");
@@ -2711,8 +2648,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
       State.set(Def, Add, Part);
-      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
-                                            Part);
       // It's useful to record the lane values too for the known minimum number
       // of elements so we do those below. This improves the code quality when
       // trying to extract the first element, for example.
@@ -2732,8 +2667,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
       State.set(Def, Add, VPIteration(Part, Lane));
-      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
-                                            Part, Lane);
     }
   }
 }
@@ -8063,18 +7996,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
         }))
       DeadInstructions.insert(IndUpdate);
-
-    // We record as "Dead" also the type-casting instructions we had identified
-    // during induction analysis. We don't need any handling for them in the
-    // vectorized loop because we have proven that, under a proper runtime
-    // test guarding the vectorized loop, the value of the phi, and the casted
-    // value of the phi, are the same. The last instruction in this casting chain
-    // will get its scalar/vector/widened def from the scalar/vector/widened def
-    // of the respective phi node. Any other casts in the induction def-use chain
-    // have no other uses outside the phi update chain, and will be ignored.
-    const InductionDescriptor &IndDes = Induction.second;
-    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
-    DeadInstructions.insert(Casts.begin(), Casts.end());
   }
 }
 
@@ -8593,9 +8514,7 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
     assert(II->getStartValue() ==
            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
-    const SmallVectorImpl<Instruction *> &Casts = II->getCastInsts();
-    return new VPWidenIntOrFpInductionRecipe(
-        Phi, Operands[0], *II, Casts.empty() ? nullptr : Casts.front());
+    return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
   }
 
   return nullptr;
@@ -9235,6 +9154,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
 
   cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
 
+  VPlanTransforms::removeRedundantInductionCasts(*Plan);
+
   // Now that sink-after is done, move induction recipes for optimized truncates
   // to the phi section of the header block.
   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
@@ -9715,9 +9636,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
 
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");
-  State.ILV->widenIntOrFpInduction(
-      IV, getInductionDescriptor(), getStartValue()->getLiveInIRValue(),
-      getTruncInst(), getVPValue(0), getCastValue(), State);
+  State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
+                                   getStartValue()->getLiveInIRValue(),
+                                   getTruncInst(), getVPValue(0), State);
 }
 
 void VPWidenPHIRecipe::execute(VPTransformState &State) {

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1004,29 +1004,21 @@ class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {
 
 /// A recipe for handling phi nodes of integer and floating-point inductions,
 /// producing their vector and scalar values.
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
   PHINode *IV;
   const InductionDescriptor &IndDesc;
 
 public:
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
-                                const InductionDescriptor &IndDesc,
-                                Instruction *Cast = nullptr)
-      : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV),
-        IndDesc(IndDesc) {
-    new VPValue(IV, this);
-
-    if (Cast)
-      new VPValue(Cast, this);
-  }
+                                const InductionDescriptor &IndDesc)
+      : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
+        IV(IV), IndDesc(IndDesc) {}
 
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
                                 const InductionDescriptor &IndDesc,
                                 TruncInst *Trunc)
-      : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV),
-        IndDesc(IndDesc) {
-    new VPValue(Trunc, this);
-  }
+      : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
+        IV(IV), IndDesc(IndDesc) {}
 
   ~VPWidenIntOrFpInductionRecipe() override = default;
 
@@ -1048,13 +1040,6 @@ class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
   /// Returns the start value of the induction.
   VPValue *getStartValue() { return getOperand(0); }
 
-  /// Returns the cast VPValue, if one is attached, or nullptr otherwise.
-  VPValue *getCastValue() {
-    if (getNumDefinedValues() != 2)
-      return nullptr;
-    return getVPValue(1);
-  }
-
   /// Returns the first defined value as TruncInst, if it is one or nullptr
   /// otherwise.
   TruncInst *getTruncInst() {

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -294,3 +294,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
     delete ToDelete;
   return Changed;
 }
+
+void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove;
+  for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) {
+    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+    if (!IV || IV->getTruncInst())
+      continue;
+
+    // Visit all casts connected to IV and in Casts. Collect them.
+    // remember them for removal.
+    auto &Casts = IV->getInductionDescriptor().getCastInsts();
+    VPValue *FindMyCast = IV;
+    for (Instruction *IRCast : reverse(Casts)) {
+      VPRecipeBase *FoundUserCast = nullptr;
+      for (auto *U : FindMyCast->users()) {
+        auto *UserCast = cast<VPRecipeBase>(U);
+        if (UserCast->getNumDefinedValues() == 1 &&
+            UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
+          FoundUserCast = UserCast;
+          break;
+        }
+      }
+      assert(FoundUserCast && "Missing a cast to remove");
+      CastsToRemove.emplace_back(FoundUserCast, IV);
+      FindMyCast = FoundUserCast->getVPSingleValue();
+    }
+  }
+  for (auto &E : CastsToRemove) {
+    E.first->getVPSingleValue()->replaceAllUsesWith(E.second);
+    E.first->eraseFromParent();
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -37,6 +37,14 @@ struct VPlanTransforms {
   static bool sinkScalarOperands(VPlan &Plan);
 
   static bool mergeReplicateRegions(VPlan &Plan);
+
+  /// Remove redundant casts of inductions.
+  ///
+  /// Such redundant casts are casts of induction variables that can be ignored,
+  /// because we already proved that the casted phi is equal to the uncasted phi
+  /// in the vectorized loop. There is no need to vectorize the cast - the same
+  /// value can be used for both the phi and casts in the vector loop.
+  static void removeRedundantInductionCasts(VPlan &Plan);
 };
 
 } // namespace llvm

diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -933,3 +933,41 @@ loop:
 exit:
   ret void
 }
+
+; Test case where %iv.2.ext and %iv.2.conv become redundant due to the SCEV
+; predicates generated for the vector loop. They should be removed in the
+; vector loop.
+define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, i32* %ptr) {
+; CHECK-LABEL: @test_optimized_cast_induction_feeding_first_order_recurrence(
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT:    [[MAIN_IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[MAIN_IV_NEXT:%.+]], %vector.body ]
+; CHECK-NEXT:    [[VEC_RECUR:%.+]] = phi <2 x i32> [ <i32 poison, i32 0>, %vector.ph ], [ [[VEC_IV:%.+]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IV]] = phi <2 x i32> [ %induction, %vector.ph ], [ [[VEC_IV_NEXT:%.+]], %vector.body ]
+; CHECK-NEXT:    [[MAIN_IV_0:%.+]] = add i64 [[MAIN_IV]], 0
+; CHECK-NEXT:    [[RECUR_SHUFFLE:%.+]] = shufflevector <2 x i32> [[VEC_RECUR]], <2 x i32> [[VEC_IV]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[GEP0:%.+]] = getelementptr inbounds i32, i32* %ptr, i64 [[MAIN_IV_0]]
+; CHECK-NEXT:    [[GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0
+; CHECK-NEXT:    [[GEP_CAST:%.+]] = bitcast i32* [[GEP1]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[RECUR_SHUFFLE]], <2 x i32>* [[GEP_CAST]], align 4
+; CHECK-NEXT:    [[MAIN_IV_NEXT]] = add nuw i64 [[MAIN_IV]], 2
+; CHECK-NEXT:    [[VEC_IV_NEXT]] = add <2 x i32> [[VEC_IV]],
+;
+entry:
+  br label %loop
+
+loop:
+  %for = phi i32 [ 0, %entry ], [ %iv.2.conv, %loop ]
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
+  %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+  %iv.2.ext = shl i32 %iv.2, 24
+  %iv.2.conv = ashr exact i32 %iv.2.ext, 24
+  %gep = getelementptr inbounds i32, i32* %ptr, i64 %iv.1
+  store i32 %for, i32* %gep, align 4
+  %iv.2.next = add nsw i32 %iv.2.conv, %step
+  %iv.1.next = add nuw nsw i64 %iv.1, 1
+  %exitcond = icmp eq i64 %iv.1.next, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}