[LV] Scalarize operands of predicated instructions

This patch attempts to scalarize the operand expressions of predicated instructions if they were conditionally executed in the original loop. After scalarization, the expressions will be sunk inside the blocks created for the predicated instructions. The transformation essentially performs un-if-conversion on the operands. The cost model has been updated to determine if scalarization is profitable. It compares the cost of a vectorized instruction, assuming it will be if-converted, to the cost of the scalarized instruction, assuming that the instructions corresponding to each vector lane will be sunk inside a predicated block, possibly avoiding execution. If it's more profitable to scalarize the entire expression tree feeding the predicated instruction, the expression will be scalarized; otherwise, it will be vectorized. We only consider the cost of the entire expression to accurately estimate the cost of the required insertelement and extractelement instructions. Differential Revision: https://reviews.llvm.org/D26083 llvm-svn: 288909
llvm · Dec 7, 2016 · 364da7e · 364da7e
1 parent 41552d6
commit 364da7e
Show file tree

Hide file tree

Showing 6 changed files with 539 additions and 14 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -518,6 +518,10 @@ class InnerLoopVectorizer {
   /// induction variable will first be truncated to the corresponding type.
   void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
 
+  /// Returns true if an instruction \p I should be scalarized instead of
+  /// vectorized for the chosen vectorization factor.
+  bool shouldScalarizeInstruction(Instruction *I) const;
+
   /// Returns true if we should generate a scalar version of \p IV.
   bool needsScalarInduction(Instruction *IV) const;
 
@@ -1907,6 +1911,15 @@ class LoopVectorizationCostModel {
     return MinBWs;
   }
 
+  /// \returns True if it is more profitable to scalarize instruction \p I for
+  /// vectorization factor \p VF.
+  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+    auto Scalars = InstsToScalarize.find(VF);
+    assert(Scalars != InstsToScalarize.end() &&
+           "VF not yet analyzed for scalarization profitability");
+    return Scalars->second.count(I);
+  }
+
 private:
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -1949,6 +1962,29 @@ class LoopVectorizationCostModel {
   /// to this type.
   MapVector<Instruction *, uint64_t> MinBWs;
 
+  /// A type representing the costs for instructions if they were to be
+  /// scalarized rather than vectorized. The entries are Instruction-Cost
+  /// pairs.
+  typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+
+  /// A map holding scalar costs for different vectorization factors. The
+  /// presence of a cost for an instruction in the mapping indicates that the
+  /// instruction will be scalarized when vectorizing with the associated
+  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+
+  /// Returns the expected difference in cost from scalarizing the expression
+  /// feeding a predicated instruction \p PredInst. The instructions to
+  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+  /// non-negative return value implies the expression will be scalarized.
+  /// Currently, only single-use chains are considered for scalarization.
+  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+                              unsigned VF);
+
+  /// Collects the instructions to scalarize for each predicated instruction in
+  /// the loop.
+  void collectInstsToScalarize(unsigned VF);
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -2183,12 +2219,17 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
   VecInd->addIncoming(LastInduction, LoopVectorLatch);
 }
 
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+  return Legal->isScalarAfterVectorization(I) ||
+         Cost->isProfitableToScalarize(I, VF);
+}
+
 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
-  if (Legal->isScalarAfterVectorization(IV))
+  if (shouldScalarizeInstruction(IV))
     return true;
   auto isScalarInst = [&](User *U) -> bool {
     auto *I = cast<Instruction>(U);
-    return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I));
+    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
   };
   return any_of(IV->users(), isScalarInst);
 }
@@ -2229,7 +2270,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && IV->getType() == Induction->getType() && Step &&
-      !Legal->isScalarAfterVectorization(EntryVal)) {
+      !shouldScalarizeInstruction(EntryVal)) {
     createVectorIntInductionPHI(ID, EntryVal);
     VectorizedIV = true;
   }
@@ -4648,10 +4689,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       continue;
 
     // Scalarize instructions that should remain scalar after vectorization.
-    if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
+    if (VF > 1 &&
+        !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
           isa<DbgInfoIntrinsic>(&I)) &&
-        Legal->isScalarAfterVectorization(&I)) {
-      scalarizeInstruction(&I);
+        shouldScalarizeInstruction(&I)) {
+      scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
       continue;
     }
 
@@ -6124,6 +6166,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
 
     Factor.Width = UserVF;
+    collectInstsToScalarize(UserVF);
     return Factor;
   }
 
@@ -6530,10 +6573,160 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
   return RUs;
 }
 
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+
+  // If we aren't vectorizing the loop, or if we've already collected the
+  // instructions to scalarize, there's nothing to do. Collection may already
+  // have occurred if we have a user-selected VF and are now computing the
+  // expected cost for interleaving.
+  if (VF < 2 || InstsToScalarize.count(VF))
+    return;
+
+  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
+  // not profitable to scalarize any instructions, the presence of VF in the
+  // map will indicate that we've analyzed it already.
+  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+  // Find all the instructions that are scalar with predication in the loop and
+  // determine if it would be better to not if-convert the blocks they are in.
+  // If so, we also record the instructions to scalarize.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!Legal->blockNeedsPredication(BB))
+      continue;
+    for (Instruction &I : *BB)
+      if (Legal->isScalarWithPredication(&I)) {
+        ScalarCostsTy ScalarCosts;
+        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+      }
+  }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+    unsigned VF) {
+
+  assert(!Legal->isUniformAfterVectorization(PredInst) &&
+         "Instruction marked uniform-after-vectorization will be predicated");
+
+  // Initialize the discount to zero, meaning that the scalar version and the
+  // vector version cost the same.
+  int Discount = 0;
+
+  // Holds instructions to analyze. The instructions we visit are mapped in
+  // ScalarCosts. Those instructions are the ones that would be scalarized if
+  // we find that the scalar version costs less.
+  SmallVector<Instruction *, 8> Worklist;
+
+  // Returns true if the given instruction can be scalarized.
+  auto canBeScalarized = [&](Instruction *I) -> bool {
+
+    // We only attempt to scalarize instructions forming a single-use chain
+    // from the original predicated block that would otherwise be vectorized.
+    // Although not strictly necessary, we give up on instructions we know will
+    // already be scalar to avoid traversing chains that are unlikely to be
+    // beneficial.
+    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+        Legal->isScalarAfterVectorization(I))
+      return false;
+
+    // If the instruction is scalar with predication, it will be analyzed
+    // separately. We ignore it within the context of PredInst.
+    if (Legal->isScalarWithPredication(I))
+      return false;
+
+    // If any of the instruction's operands are uniform after vectorization,
+    // the instruction cannot be scalarized. This prevents, for example, a
+    // masked load from being scalarized.
+    //
+    // We assume we will only emit a value for lane zero of an instruction
+    // marked uniform after vectorization, rather than VF identical values.
+    // Thus, if we scalarize an instruction that uses a uniform, we would
+    // create uses of values corresponding to the lanes we aren't emitting code
+    // for. This behavior can be changed by allowing getScalarValue to clone
+    // the lane zero values for uniforms rather than asserting.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get()))
+        if (Legal->isUniformAfterVectorization(J))
+          return false;
+
+    // Otherwise, we can scalarize the instruction.
+    return true;
+  };
+
+  // Returns true if an operand that cannot be scalarized must be extracted
+  // from a vector. We will account for this scalarization overhead below. Note
+  // that the non-void predicated instructions are placed in their own blocks,
+  // and their return values are inserted into vectors. Thus, an extract would
+  // still be required.
+  auto needsExtract = [&](Instruction *I) -> bool {
+    return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+  };
+
+  // Compute the expected cost discount from scalarizing the entire expression
+  // feeding the predicated instruction. We currently only consider expressions
+  // that are single-use instruction chains.
+  Worklist.push_back(PredInst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+
+    // If we've already analyzed the instruction, there's nothing to do.
+    if (ScalarCosts.count(I))
+      continue;
+
+    // Compute the cost of the vector instruction. Note that this cost already
+    // includes the scalarization overhead of the predicated instruction.
+    unsigned VectorCost = getInstructionCost(I, VF).first;
+
+    // Compute the cost of the scalarized instruction. This cost is the cost of
+    // the instruction as if it wasn't if-converted and instead remained in the
+    // predicated block. We will scale this cost by block probability after
+    // computing the scalarization overhead.
+    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+    // Compute the scalarization overhead of needed insertelement instructions
+    // and phi nodes.
+    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+      ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
+                                             false, TTI);
+      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+    }
+
+    // Compute the scalarization overhead of needed extractelement
+    // instructions. For each of the instruction's operands, if the operand can
+    // be scalarized, add it to the worklist; otherwise, account for the
+    // overhead.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get())) {
+        assert(VectorType::isValidElementType(J->getType()) &&
+               "Instruction has non-scalar type");
+        if (canBeScalarized(J))
+          Worklist.push_back(J);
+        else if (needsExtract(J))
+          ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
+                                                 false, true, TTI);
+      }
+
+    // Scale the total scalar cost by block probability.
+    ScalarCost /= getReciprocalPredBlockProb();
+
+    // Compute the discount. A non-negative discount means the vector version
+    // of the instruction costs more, and scalarizing would be beneficial.
+    Discount += VectorCost - ScalarCost;
+    ScalarCosts[I] = ScalarCost;
+  }
+
+  return Discount;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
 
+  // Collect the instructions (and their associated costs) that will be more
+  // profitable to scalarize.
+  collectInstsToScalarize(VF);
+
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     VectorizationCostTy BlockCost;
@@ -6641,6 +6834,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   if (Legal->isUniformAfterVectorization(I))
     VF = 1;
 
+  if (VF > 1 && isProfitableToScalarize(I, VF))
+    return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
   Type *VectorTy;
   unsigned C = getInstructionCost(I, VF, VectorTy);
 
@@ -7007,7 +7203,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
 
-  // Insert values known to be scalar into VecValuesToIgnore.
+  // Insert values known to be scalar into VecValuesToIgnore. This is a
+  // conservative estimation of the values that will later be scalarized.
+  //
+  // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
+  //        still be scalarized. For example, we may find an instruction to be
+  //        more profitable for a given vectorization factor if it were to be
+  //        scalarized. But at this point, we haven't yet computed the
+  //        vectorization factor.
   for (auto *BB : TheLoop->getBlocks())
     for (auto &I : *BB)
       if (Legal->isScalarAfterVectorization(&I))

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the scalarized operands for a
+; user-specified vectorization factor when interleaving is disabled. We use the
+; "optsize" attribute to disable all interleaving calculations.
+;
+; CHECK: vector.body:
+; CHECK:   %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
+; CHECK:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK:   %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK:   %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK:   %[[T02:.+]] = add nsw i64 %[[T01]], %x
+; CHECK:   %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
+; CHECK:   %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
+; CHECK:   br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK:   %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK:   %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK:   %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK:   %[[T08:.+]] = add nsw i64 %[[T07]], %x
+; CHECK:   %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
+; CHECK:   %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
+; CHECK:   br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK:   phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %tmp2 = load i64, i64* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i64 %tmp2, %x
+  %tmp4 = udiv i64 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i64 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, 100
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i64 [ %tmp6, %for.inc ]
+  ret i64 %tmp7
+}