[ScopBuilder] Introduce -polly-stmt-granularity=scalar-indep option.

The option splits BasicBlocks into minimal statements such that no additional scalar dependencies are introduced. The algorithm is based on a union-find structure, and unites sets if putting them into separate statements would introduce a scalar dependencies. As a consequence, instructions may be split into separate statements such their relative order is different than the statements they are in. This is accounted for instructions whose relative order matters (e.g. memory accesses). The algorithm is generic in that heuristic changes can be made relatively easily. We might relax the order requirement for read-reads or accesses to different base pointers. Forwardable instructions can be made to not cause a join. This implementation gives us a speed-up of 82% in SPEC 2006 456.hmmer benchmark by allowing loop-distribution in a hot loop such that one of the loops can be vectorized. Differential Revision: https://reviews.llvm.org/D38403 llvm-svn: 314983
llvm · Oct 5, 2017 · cc345e6 · cc345e6
1 parent fc50004
commit cc345e6
Show file tree

Hide file tree

Showing 7 changed files with 517 additions and 2 deletions.
diff --git a/polly/include/polly/ScopBuilder.h b/polly/include/polly/ScopBuilder.h
@@ -235,6 +235,12 @@ class ScopBuilder {
   /// separator is found.
   void buildSequentialBlockStmts(BasicBlock *BB);
 
+  /// Create one or more ScopStmts for @p BB using equivalence classes.
+  ///
+  /// Instructions of a basic block that belong to the same equivalence class
+  /// are added to the same statement.
+  void buildEqivClassBlockStmts(BasicBlock *BB);
+
   /// Create ScopStmt for all BBs and non-affine subregions of @p SR.
   ///
   /// @param SR A subregion of @p R.

diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -102,14 +103,16 @@ static cl::opt<bool> DisableMultiplicativeReductions(
     cl::desc("Disable multiplicative reductions"), cl::Hidden, cl::ZeroOrMore,
     cl::init(false), cl::cat(PollyCategory));
 
-enum class GranularityChoice { BasicBlocks };
+enum class GranularityChoice { BasicBlocks, ScalarIndepependence };
 
 static cl::opt<GranularityChoice> StmtGranularity(
     "polly-stmt-granularity",
     cl::desc(
         "Algorithm to use for splitting basic blocks into multiple statements"),
     cl::values(clEnumValN(GranularityChoice::BasicBlocks, "bb",
-                          "One statement per basic block")),
+                          "One statement per basic block"),
+               clEnumValN(GranularityChoice::ScalarIndepependence,
+                          "scalar-indep", "Scalar independence heuristic")),
     cl::init(GranularityChoice::BasicBlocks), cl::cat(PollyCategory));
 
 void ScopBuilder::buildPHIAccesses(ScopStmt *PHIStmt, PHINode *PHI,
@@ -701,6 +704,178 @@ void ScopBuilder::buildSequentialBlockStmts(BasicBlock *BB) {
   scop->addScopStmt(BB, SurroundingLoop, Instructions, Count);
 }
 
+/// Is @p Inst an ordered instruction?
+///
+/// An unordered instruction is an instruction, such that a sequence of
+/// unordered instructions can be permuted without changing semantics. Any
+/// instruction for which this is not always the case is ordered.
+static bool isOrderedInstruction(Instruction *Inst) {
+  return Inst->mayHaveSideEffects() || Inst->mayReadOrWriteMemory();
+}
+
+/// Join instructions to the same statement if one uses the scalar result of the
+/// other.
+static void joinOperandTree(EquivalenceClasses<Instruction *> &UnionFind,
+                            ArrayRef<Instruction *> ModeledInsts) {
+  for (Instruction *Inst : ModeledInsts) {
+    if (isa<PHINode>(Inst))
+      continue;
+
+    for (Use &Op : Inst->operands()) {
+      Instruction *OpInst = dyn_cast<Instruction>(Op.get());
+      if (!OpInst)
+        continue;
+
+      // Check if OpInst is in the BB and is a modeled instruction.
+      auto OpVal = UnionFind.findValue(OpInst);
+      if (OpVal == UnionFind.end())
+        continue;
+
+      UnionFind.unionSets(Inst, OpInst);
+    }
+  }
+}
+
+/// Join instructions that are used as incoming value in successor PHIs into the
+/// epilogue.
+static void
+joinIncomingPHIValuesIntoEpilogue(EquivalenceClasses<Instruction *> &UnionFind,
+                                  ArrayRef<Instruction *> ModeledInsts,
+                                  BasicBlock *BB) {
+  for (BasicBlock *Succ : successors(BB)) {
+    for (Instruction &SuccInst : *Succ) {
+      PHINode *SuccPHI = dyn_cast<PHINode>(&SuccInst);
+      if (!SuccPHI)
+        break;
+
+      Value *IncomingVal = SuccPHI->getIncomingValueForBlock(BB);
+      Instruction *IncomingInst = dyn_cast<Instruction>(IncomingVal);
+      if (!IncomingInst)
+        continue;
+      if (IncomingInst->getParent() != BB)
+        continue;
+      if (UnionFind.findValue(IncomingInst) == UnionFind.end())
+        continue;
+
+      UnionFind.unionSets(nullptr, IncomingInst);
+    }
+  }
+}
+
+/// Ensure that the order of ordered instructions does not change.
+///
+/// If we encounter an ordered instruction enclosed in instructions belonging to
+/// a different statement (which might as well contain ordered instructions, but
+/// this is not tested here), join them.
+static void
+joinOrderedInstructions(EquivalenceClasses<Instruction *> &UnionFind,
+                        ArrayRef<Instruction *> ModeledInsts) {
+  SetVector<Instruction *> SeenLeaders;
+  for (Instruction *Inst : ModeledInsts) {
+    if (!isOrderedInstruction(Inst))
+      continue;
+
+    Instruction *Leader = UnionFind.getLeaderValue(Inst);
+    bool Inserted = SeenLeaders.insert(Leader);
+    if (Inserted)
+      continue;
+
+    // Merge statements to close holes. Say, we have already seen statements A
+    // and B, in this order. Then we see an instruction of A again and we would
+    // see the pattern "A B A". This function joins all statements until the
+    // only seen occurrence of A.
+    for (Instruction *Prev : reverse(SeenLeaders)) {
+      // Items added to 'SeenLeaders' are leaders, but may have lost their
+      // leadership status when merged into another statement.
+      Instruction *PrevLeader = UnionFind.getLeaderValue(SeenLeaders.back());
+      if (PrevLeader == Leader)
+        break;
+      UnionFind.unionSets(Prev, Leader);
+    }
+  }
+}
+
+/// Also ensure that the epilogue is the last statement relative to all ordered
+/// instructions.
+///
+/// This is basically joinOrderedInstructions() but using the epilogue as
+/// 'ordered instruction'.
+static void joinAllAfterEpilogue(EquivalenceClasses<Instruction *> &UnionFind,
+                                 ArrayRef<Instruction *> ModeledInsts) {
+  bool EpilogueSeen = false;
+  for (Instruction *Inst : ModeledInsts) {
+    auto PHIWritesLeader = UnionFind.findLeader(nullptr);
+    auto InstLeader = UnionFind.findLeader(Inst);
+
+    if (PHIWritesLeader == InstLeader)
+      EpilogueSeen = true;
+
+    if (!isOrderedInstruction(Inst))
+      continue;
+
+    if (EpilogueSeen)
+      UnionFind.unionSets(PHIWritesLeader, InstLeader);
+  }
+}
+
+void ScopBuilder::buildEqivClassBlockStmts(BasicBlock *BB) {
+  Loop *L = LI.getLoopFor(BB);
+
+  // Extracting out modeled instructions saves us from checking
+  // shouldModelInst() repeatedly.
+  SmallVector<Instruction *, 32> ModeledInsts;
+  EquivalenceClasses<Instruction *> UnionFind;
+  for (Instruction &Inst : *BB) {
+    if (!shouldModelInst(&Inst, L))
+      continue;
+    ModeledInsts.push_back(&Inst);
+    UnionFind.insert(&Inst);
+  }
+
+  // 'nullptr' represents the last statement for a basic block. It contains no
+  // instructions, but holds the PHI write accesses for successor basic blocks.
+  // If a PHI has an incoming value defined in this BB, it can also be merged
+  // with other statements.
+  // TODO: We wouldn't need this if we would add PHIWrites into the statement
+  // that defines the incoming value (if in the BB) instead of always the last,
+  // so we could unconditionally always add a last statement.
+  UnionFind.insert(nullptr);
+
+  joinOperandTree(UnionFind, ModeledInsts);
+  joinIncomingPHIValuesIntoEpilogue(UnionFind, ModeledInsts, BB);
+  joinOrderedInstructions(UnionFind, ModeledInsts);
+  joinAllAfterEpilogue(UnionFind, ModeledInsts);
+
+  // The list of instructions for statement (statement represented by the leader
+  // instruction). The order of statements instructions is reversed such that
+  // the epilogue is first. This makes it easier to ensure that the epilogue is
+  // the last statement.
+  MapVector<Instruction *, std::vector<Instruction *>> LeaderToInstList;
+
+  // Ensure that the epilogue is last.
+  LeaderToInstList[nullptr];
+
+  // Collect the instructions of all leaders. UnionFind's member iterator
+  // unfortunately are not in any specific order.
+  for (Instruction &Inst : reverse(*BB)) {
+    auto LeaderIt = UnionFind.findLeader(&Inst);
+    if (LeaderIt == UnionFind.member_end())
+      continue;
+
+    std::vector<Instruction *> &InstList = LeaderToInstList[*LeaderIt];
+    InstList.push_back(&Inst);
+  }
+
+  // Finally build the statements.
+  int Count = 0;
+  for (auto &Instructions : reverse(LeaderToInstList)) {
+    std::vector<Instruction *> &InstList = Instructions.second;
+    std::reverse(InstList.begin(), InstList.end());
+    scop->addScopStmt(BB, L, std::move(InstList), Count);
+    Count += 1;
+  }
+}
+
 void ScopBuilder::buildStmts(Region &SR) {
   if (scop->isNonAffineSubRegion(&SR)) {
     std::vector<Instruction *> Instructions;
@@ -722,6 +897,9 @@ void ScopBuilder::buildStmts(Region &SR) {
       case GranularityChoice::BasicBlocks:
         buildSequentialBlockStmts(BB);
         break;
+      case GranularityChoice::ScalarIndepependence:
+        buildEqivClassBlockStmts(BB);
+        break;
       }
     }
 }

diff --git a/polly/test/ScopInfo/granularity_scalar-indep.ll b/polly/test/ScopInfo/granularity_scalar-indep.ll
@@ -0,0 +1,68 @@
+; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-scops -analyze < %s | FileCheck %s -match-full-lines
+;
+; Split a block into two independent statements that share no scalar.
+; This case has the instructions of the two statements interleaved, such that
+; splitting the BasicBlock in the middle would cause a scalar dependency.
+;
+; for (int j = 0; j < n; j += 1) {
+; body:
+;   double valA = A[0];
+;   double valB = 21.0 + 21.0;
+;   A[0] = valA;
+;   B[0] = valB;
+; }
+;
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B) {
+entry:
+  br label %for
+
+for:
+  %j = phi i32 [0, %entry], [%j.inc, %inc]
+  %j.cmp = icmp slt i32 %j, %n
+  br i1 %j.cmp, label %body, label %exit
+
+    body:
+      %valA = load double, double* %A
+      %valB = fadd double 21.0, 21.0
+      store double %valA, double* %A
+      store double %valB, double* %B
+      br label %inc
+
+inc:
+  %j.inc = add nuw nsw i32 %j, 1
+  br label %for
+
+exit:
+  br label %return
+
+return:
+  ret void
+}
+
+
+; CHECK:      Statements {
+; CHECK-NEXT:     Stmt_body
+; CHECK-NEXT:         Domain :=
+; CHECK-NEXT:             [n] -> { Stmt_body[i0] : 0 <= i0 < n };
+; CHECK-NEXT:         Schedule :=
+; CHECK-NEXT:             [n] -> { Stmt_body[i0] -> [i0, 0] };
+; CHECK-NEXT:         ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:             [n] -> { Stmt_body[i0] -> MemRef_A[0] };
+; CHECK-NEXT:         MustWriteAccess :=  [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:             [n] -> { Stmt_body[i0] -> MemRef_A[0] };
+; CHECK-NEXT:         Instructions {
+; CHECK-NEXT:               %valA = load double, double* %A
+; CHECK-NEXT:               store double %valA, double* %A
+; CHECK-NEXT:         }
+; CHECK-NEXT:     Stmt_body1
+; CHECK-NEXT:         Domain :=
+; CHECK-NEXT:             [n] -> { Stmt_body1[i0] : 0 <= i0 < n };
+; CHECK-NEXT:         Schedule :=
+; CHECK-NEXT:             [n] -> { Stmt_body1[i0] -> [i0, 1] };
+; CHECK-NEXT:         MustWriteAccess :=  [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:             [n] -> { Stmt_body1[i0] -> MemRef_B[0] };
+; CHECK-NEXT:         Instructions {
+; CHECK-NEXT:               %valB = fadd double 2.100000e+01, 2.100000e+01
+; CHECK-NEXT:               store double %valB, double* %B
+; CHECK-NEXT:         }
+; CHECK-NEXT: }
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll
@@ -0,0 +1,68 @@
+; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-scops -analyze < %s | FileCheck %s -match-full-lines
+;
+; Split a block into two independent statements that share no scalar.
+; This case has an independent statement just for PHI writes.
+;
+; for (int j = 0; j < n; j += 1) {
+; bodyA:
+;   double valA = A[0];
+;   A[0] = valA;
+;
+; bodyB:
+;   phi = 42.0;
+; }
+;
+define void @func(i32 %n, double* noalias nonnull %A) {
+entry:
+  br label %for
+
+for:
+  %j = phi i32 [0, %entry], [%j.inc, %inc]
+  %j.cmp = icmp slt i32 %j, %n
+  br i1 %j.cmp, label %bodyA, label %exit
+
+    bodyA:
+      %valA = load double, double* %A
+      store double %valA, double* %A
+      br label %bodyB
+
+    bodyB:
+      %phi = phi double [42.0, %bodyA]
+      br label %inc
+
+inc:
+  %j.inc = add nuw nsw i32 %j, 1
+  br label %for
+
+exit:
+  br label %return
+
+return:
+  ret void
+}
+
+
+; CHECK:      Statements {
+; CHECK-NEXT:     Stmt_bodyA
+; CHECK-NEXT:         Domain :=
+; CHECK-NEXT:             [n] -> { Stmt_bodyA[i0] : 0 <= i0 < n };
+; CHECK-NEXT:         Schedule :=
+; CHECK-NEXT:             [n] -> { Stmt_bodyA[i0] -> [i0, 0] };
+; CHECK-NEXT:         ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:             [n] -> { Stmt_bodyA[i0] -> MemRef_A[0] };
+; CHECK-NEXT:         MustWriteAccess :=  [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:             [n] -> { Stmt_bodyA[i0] -> MemRef_A[0] };
+; CHECK-NEXT:         Instructions {
+; CHECK-NEXT:             %valA = load double, double* %A
+; CHECK-NEXT:             store double %valA, double* %A
+; CHECK-NEXT:         }
+; CHECK-NEXT:     Stmt_bodyA1
+; CHECK-NEXT:         Domain :=
+; CHECK-NEXT:             [n] -> { Stmt_bodyA1[i0] : 0 <= i0 < n };
+; CHECK-NEXT:         Schedule :=
+; CHECK-NEXT:             [n] -> { Stmt_bodyA1[i0] -> [i0, 1] };
+; CHECK-NEXT:         MustWriteAccess :=  [Reduction Type: NONE] [Scalar: 1]
+; CHECK-NEXT:             [n] -> { Stmt_bodyA1[i0] -> MemRef_phi__phi[] };
+; CHECK-NEXT:         Instructions {
+; CHECK-NEXT:         }
+; CHECK-NEXT: }