Recommit r325001: [CallSiteSplitting] Support splitting of blocks wit…

…h instrs before call. For basic blocks with instructions between the beginning of the block and a call we have to duplicate the instructions before the call in all split blocks and add PHI nodes for uses of the duplicated instructions after the call. Currently, the threshold for the number of instructions before a call is quite low, to keep the impact on binary size low. Reviewers: junbuml, mcrosier, davidxl, davide Reviewed By: junbuml Differential Revision: https://reviews.llvm.org/D41860 llvm-svn: 325126
llvm · Feb 14, 2018 · b4e3bad · b4e3bad
1 parent de300e6
commit b4e3bad
Show file tree

Hide file tree

Showing 7 changed files with 429 additions and 151 deletions.
diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -59,11 +59,13 @@
 #include "llvm/Transforms/Scalar/CallSiteSplitting.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 
 using namespace llvm;
@@ -73,6 +75,15 @@ using namespace PatternMatch;
 
 STATISTIC(NumCallSiteSplit, "Number of call-site split");
 
+/// Only allow instructions before a call, if their CodeSize cost is below
+/// DuplicationThreshold. Those instructions need to be duplicated in all
+/// split blocks.
+static cl::opt<unsigned>
+    DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
+                         cl::desc("Only allow instructions before a call, if "
+                                  "their cost is below DuplicationThreshold"),
+                         cl::init(5));
+
 static void addNonNullAttribute(CallSite CS, Value *Op) {
   unsigned ArgNo = 0;
   for (auto &I : CS.args()) {
@@ -168,20 +179,26 @@ static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
   return Preds;
 }
 
-static bool canSplitCallSite(CallSite CS) {
+static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
   // FIXME: As of now we handle only CallInst. InvokeInst could be handled
   // without too much effort.
   Instruction *Instr = CS.getInstruction();
   if (!isa<CallInst>(Instr))
     return false;
 
-  // Allow splitting a call-site only when there is no instruction before the
-  // call-site in the basic block. Based on this constraint, we only clone the
-  // call instruction, and we do not move a call-site across any other
-  // instruction.
   BasicBlock *CallSiteBB = Instr->getParent();
-  if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
-    return false;
+  // Allow splitting a call-site only when the CodeSize cost of the
+  // instructions before the call is less then DuplicationThreshold. The
+  // instructions before the call will be duplicated in the split blocks and
+  // corresponding uses will be updated.
+  unsigned Cost = 0;
+  for (auto &InstBeforeCall :
+       llvm::make_range(CallSiteBB->begin(), Instr->getIterator())) {
+    Cost += TTI.getInstructionCost(&InstBeforeCall,
+                                   TargetTransformInfo::TCK_CodeSize);
+    if (Cost >= DuplicationThreshold)
+      return false;
+  }
 
   // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
   SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
@@ -246,16 +263,21 @@ static void splitCallSite(
     CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call");
 
   DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
-  for (const auto &P : Preds) {
-    BasicBlock *PredBB = P.first;
-    BasicBlock *SplitBlock =
-        SplitBlockPredecessors(TailBB, PredBB, ".predBB.split");
+
+  assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
+  // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
+  // here.
+  ValueToValueMapTy ValueToValueMaps[2];
+  for (unsigned i = 0; i < Preds.size(); i++) {
+    BasicBlock *PredBB = Preds[i].first;
+    BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+        TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i]);
     assert(SplitBlock && "Unexpected new basic block split.");
 
-    Instruction *NewCI = Instr->clone();
+    Instruction *NewCI =
+        &*std::prev(SplitBlock->getTerminator()->getIterator());
     CallSite NewCS(NewCI);
-    addConditions(NewCS, P.second);
-    NewCI->insertBefore(&*SplitBlock->getFirstInsertionPt());
+    addConditions(NewCS, Preds[i].second);
 
     // Handle PHIs used as arguments in the call-site.
     for (PHINode &PN : TailBB->phis()) {
@@ -273,13 +295,41 @@ static void splitCallSite(
       CallPN->addIncoming(NewCI, SplitBlock);
   }
 
+  auto *OriginalBegin = &*TailBB->begin();
   // Replace users of the original call with a PHI mering call-sites split.
   if (CallPN) {
-    CallPN->insertBefore(TailBB->getFirstNonPHI());
+    CallPN->insertBefore(OriginalBegin);
     Instr->replaceAllUsesWith(CallPN);
   }
 
-  Instr->eraseFromParent();
+  // Remove instructions moved to split blocks from TailBB, from the duplicated
+  // call instruction to the beginning of the basic block. If an instruction
+  // has any uses, add a new PHI node to combine the values coming from the
+  // split blocks. The new PHI nodes are placed before the first original
+  // instruction, so we do not end up deleting them. By using reverse-order, we
+  // do not introduce unnecessary PHI nodes for def-use chains from the call
+  // instruction to the beginning of the block.
+  auto I = Instr->getReverseIterator();
+  while (I != TailBB->rend()) {
+    Instruction *CurrentI = &*I++;
+    if (!CurrentI->use_empty()) {
+      // If an existing PHI has users after the call, there is no need to create
+      // a new one.
+      if (isa<PHINode>(CurrentI))
+        continue;
+      PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
+      for (auto &Mapping : ValueToValueMaps)
+        NewPN->addIncoming(Mapping[CurrentI],
+                           cast<Instruction>(Mapping[CurrentI])->getParent());
+      NewPN->insertBefore(&*TailBB->begin());
+      CurrentI->replaceAllUsesWith(NewPN);
+    }
+    CurrentI->eraseFromParent();
+    // We are done once we handled the first original instruction in TailBB.
+    if (CurrentI == OriginalBegin)
+      break;
+  }
+
   NumCallSiteSplit++;
 }
 
@@ -344,14 +394,15 @@ static bool tryToSplitOnPredicatedArgument(CallSite CS) {
   return true;
 }
 
-static bool tryToSplitCallSite(CallSite CS) {
-  if (!CS.arg_size() || !canSplitCallSite(CS))
+static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
+  if (!CS.arg_size() || !canSplitCallSite(CS, TTI))
     return false;
   return tryToSplitOnPredicatedArgument(CS) ||
          tryToSplitOnPHIPredicatedArgument(CS);
 }
 
-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
+                                TargetTransformInfo &TTI) {
   bool Changed = false;
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
     BasicBlock &BB = *BI++;
@@ -364,7 +415,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
       Function *Callee = CS.getCalledFunction();
       if (!Callee || Callee->isDeclaration())
         continue;
-      Changed |= tryToSplitCallSite(CS);
+      Changed |= tryToSplitCallSite(CS, TTI);
     }
   }
   return Changed;
@@ -379,6 +430,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 
@@ -387,7 +439,8 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
       return false;
 
     auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-    return doCallSiteSplitting(F, TLI);
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return doCallSiteSplitting(F, TLI, TTI);
   }
 };
 } // namespace
@@ -396,6 +449,7 @@ char CallSiteSplittingLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
                       "Call-site splitting", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
                     "Call-site splitting", false, false)
 FunctionPass *llvm::createCallSiteSplittingPass() {
@@ -405,8 +459,9 @@ FunctionPass *llvm::createCallSiteSplittingPass() {
 PreservedAnalyses CallSiteSplittingPass::run(Function &F,
                                              FunctionAnalysisManager &AM) {
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
 
-  if (!doCallSiteSplitting(F, TLI))
+  if (!doCallSiteSplitting(F, TLI, TTI))
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
   return PA;

diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -35,6 +35,7 @@
 ; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
 ; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo
 ; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo
+; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis on foo
 ; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-O2-NEXT: PGOIndirectCallPromotion
 ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis
@@ -79,7 +80,6 @@
 ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}PostOrderFunctionAttrsPass>
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis on foo
 ; CHECK-O2-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O2-NEXT: Running pass: CrossDSOCFIPass
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}SimplifyCFGPass>