Remove 4,096 loop scale limitation.

Summary: This is part 1 of fixes to address the problems described in https://llvm.org/bugs/show_bug.cgi?id=22719. The restriction to limit loop scales to 4,096 does not really prevent overflows anymore, as the underlying algorithm has changed and does not seem to suffer from this problem. Additionally, artificially restricting loop scales to such a low number skews frequency information, making loops of equal hotness appear to have very different hotness properties. The only loops that are artificially restricted to a scale of 4096 are infinite loops (those loops with an exit mass of 0). This prevents infinite loops from skewing the frequencies of other regions in the CFG. At the end of propagation, frequencies are scaled to values that take no more than 64 bits to represent. When the range of frequencies to be represented fits within 61 bits, it pushes up the scaling factor to a minimum of 8 to better distinguish small frequency values. Otherwise, small frequency values are all saturated down at 1. Tested on x86_64. Reviewers: dexonsmith Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D8718 llvm-svn: 233826
llvm · Apr 1, 2015 · a354f48 · a354f48
1 parent f55ec07
commit a354f48
Show file tree

Hide file tree

Showing 4 changed files with 239 additions and 25 deletions.
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -718,9 +718,6 @@ void IrreducibleGraph::addEdges(const BlockNode &Node,
 ///
 /// It has some known flaws.
 ///
-///   - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
-///     BlockFrequency's 64-bit integer precision.
-///
 ///   - The model of irreducible control flow is a rough approximation.
 ///
 ///     Modelling irreducible control flow exactly involves setting up and

diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -331,32 +331,35 @@ bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
   return true;
 }
 
-/// \brief Get the maximum allowed loop scale.
-///
-/// Gives the maximum number of estimated iterations allowed for a loop.  Very
-/// large numbers cause problems downstream (even within 64-bits).
-static Scaled64 getMaxLoopScale() { return Scaled64(1, 12); }
-
 /// \brief Compute the loop scale for a loop.
 void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
   // Compute loop scale.
   DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");
 
+  // Infinite loops need special handling. If we give the back edge an infinite
+  // mass, they may saturate all the other scales in the function down to 1,
+  // making all the other region temperatures look exactly the same. Choose an
+  // arbitrary scale to avoid these issues.
+  //
+  // FIXME: An alternate way would be to select a symbolic scale which is later
+  // replaced to be the maximum of all computed scales plus 1. This would
+  // appropriately describe the loop as having a large scale, without skewing
+  // the final frequency computation.
+  const Scaled64 InifiniteLoopScale(1, 12);
+
   // LoopScale == 1 / ExitMass
   // ExitMass == HeadMass - BackedgeMass
   BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;
 
-  // Block scale stores the inverse of the scale.
-  Loop.Scale = ExitMass.toScaled().inverse();
+  // Block scale stores the inverse of the scale. If this is an infinite loop,
+  // its exit mass will be zero. In this case, use an arbitrary scale for the
+  // loop scale.
+  Loop.Scale =
+      ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();
 
   DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
                << " - " << Loop.BackedgeMass << ")\n"
                << " - scale = " << Loop.Scale << "\n");
-
-  if (Loop.Scale > getMaxLoopScale()) {
-    Loop.Scale = getMaxLoopScale();
-    DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n");
-  }
 }
 
 /// \brief Package up a loop.
@@ -424,15 +427,24 @@ static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                      const Scaled64 &Min, const Scaled64 &Max) {
   // Scale the Factor to a size that creates integers.  Ideally, integers would
   // be scaled so that Max == UINT64_MAX so that they can be best
-  // differentiated.  However, the register allocator currently deals poorly
-  // with large numbers.  Instead, push Min up a little from 1 to give some
-  // room to differentiate small, unequal numbers.
-  //
-  // TODO: fix issues downstream so that ScalingFactor can be
-  // Scaled64(1,64)/Max.
-  Scaled64 ScalingFactor = Min.inverse();
-  if ((Max / Min).lg() < 60)
+  // differentiated.  However, in the presence of large frequency values, small
+  // frequencies are scaled down to 1, making it impossible to differentiate
+  // small, unequal numbers. When the spread between Min and Max frequencies
+  // fits well within MaxBits, we make the scale be at least 8.
+  const unsigned MaxBits = 64;
+  const unsigned SpreadBits = (Max / Min).lg();
+  Scaled64 ScalingFactor;
+  if (SpreadBits <= MaxBits - 3) {
+    // If the values are small enough, make the scaling factor at least 8 to
+    // allow distinguishing small values.
+    ScalingFactor = Min.inverse();
     ScalingFactor <<= 3;
+  } else {
+    // If the values need more than MaxBits to be represented, saturate small
+    // frequency values down to 1 by using a scaling factor that benefits large
+    // frequency values.
+    ScalingFactor = Scaled64(1, MaxBits) / Max;
+  }
 
   // Translate the floats to integers.
   DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max

diff --git a/llvm/test/Analysis/BlockFrequencyInfo/bad_input.ll b/llvm/test/Analysis/BlockFrequencyInfo/bad_input.ll
@@ -32,7 +32,8 @@ define void @infinite_loop(i1 %x) {
 entry:
   br i1 %x, label %for.body, label %for.end, !prof !1
 
-; Check that the loop scale maxes out at 4096, giving 2048 here.
+; Check that the infinite loop is arbitrarily scaled to max out at 4096,
+; giving 2048 here.
 ; CHECK-NEXT: for.body: float = 2048.0,
 for.body:
   %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]

diff --git a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -0,0 +1,204 @@
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+
+; This code contains three loops. One is triple-nested, the
+; second is double nested and the third is a single loop. At
+; runtime, all three loops execute 1,000,000 times each. We use to
+; give different frequencies to each of the loops because loop
+; scales were limited to no more than 4,096.
+;
+; This was penalizing the hotness of the second and third loops
+; because BFI was reducing the loop scale for for.cond16 and
+; for.cond26 to a max of 4,096.
+;
+; Without this restriction, all loops are now correctly given the same
+; frequency values.
+;
+; Original C code:
+;
+;
+; int g;
+; __attribute__((noinline)) void bar() {
+;  g++;
+; }
+;
+; extern int printf(const char*, ...);
+;
+; int main()
+; {
+;   int i, j, k;
+;
+;   g = 0;
+;   for (i = 0; i < 100; i++)
+;     for (j = 0; j < 100; j++)
+;        for (k = 0; k < 100; k++)
+;            bar();
+;
+;   printf ("g = %d\n", g);
+;   g = 0;
+;
+;   for (i = 0; i < 100; i++)
+;     for (j = 0; j < 10000; j++)
+;         bar();
+;
+;   printf ("g = %d\n", g);
+;   g = 0;
+;
+;
+;   for (i = 0; i < 1000000; i++)
+;     bar();
+;
+;   printf ("g = %d\n", g);
+;   g = 0;
+; }
+
+@g = common global i32 0, align 4
+@.str = private unnamed_addr constant [8 x i8] c"g = %d\0A\00", align 1
+
+declare void @bar()
+declare i32 @printf(i8*, ...)
+
+; CHECK: Printing analysis {{.*}} for function 'main':
+; CHECK-NEXT: block-frequency-info: main
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  %k = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* @g, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end12, !prof !1
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %j, align 4
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc7, %for.body
+  %1 = load i32, i32* %j, align 4
+  %cmp2 = icmp slt i32 %1, 100
+  br i1 %cmp2, label %for.body3, label %for.end9, !prof !2
+
+for.body3:                                        ; preds = %for.cond1
+  store i32 0, i32* %k, align 4
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %2 = load i32, i32* %k, align 4
+  %cmp5 = icmp slt i32 %2, 100
+  br i1 %cmp5, label %for.body6, label %for.end, !prof !3
+
+; CHECK: - for.body6: float = 500000.5, int = 4000003
+for.body6:                                        ; preds = %for.cond4
+  call void @bar()
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %3 = load i32, i32* %k, align 4
+  %inc = add nsw i32 %3, 1
+  store i32 %inc, i32* %k, align 4
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc7
+
+for.inc7:                                         ; preds = %for.end
+  %4 = load i32, i32* %j, align 4
+  %inc8 = add nsw i32 %4, 1
+  store i32 %inc8, i32* %j, align 4
+  br label %for.cond1
+
+for.end9:                                         ; preds = %for.cond1
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end9
+  %5 = load i32, i32* %i, align 4
+  %inc11 = add nsw i32 %5, 1
+  store i32 %inc11, i32* %i, align 4
+  br label %for.cond
+
+for.end12:                                        ; preds = %for.cond
+  %6 = load i32, i32* @g, align 4
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %6)
+  store i32 0, i32* @g, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond13
+
+for.cond13:                                       ; preds = %for.inc22, %for.end12
+  %7 = load i32, i32* %i, align 4
+  %cmp14 = icmp slt i32 %7, 100
+  br i1 %cmp14, label %for.body15, label %for.end24, !prof !1
+
+for.body15:                                       ; preds = %for.cond13
+  store i32 0, i32* %j, align 4
+  br label %for.cond16
+
+for.cond16:                                       ; preds = %for.inc19, %for.body15
+  %8 = load i32, i32* %j, align 4
+  %cmp17 = icmp slt i32 %8, 10000
+  br i1 %cmp17, label %for.body18, label %for.end21, !prof !4
+
+; CHECK: - for.body18: float = 500000.5, int = 4000003
+for.body18:                                       ; preds = %for.cond16
+  call void @bar()
+  br label %for.inc19
+
+for.inc19:                                        ; preds = %for.body18
+  %9 = load i32, i32* %j, align 4
+  %inc20 = add nsw i32 %9, 1
+  store i32 %inc20, i32* %j, align 4
+  br label %for.cond16
+
+for.end21:                                        ; preds = %for.cond16
+  br label %for.inc22
+
+for.inc22:                                        ; preds = %for.end21
+  %10 = load i32, i32* %i, align 4
+  %inc23 = add nsw i32 %10, 1
+  store i32 %inc23, i32* %i, align 4
+  br label %for.cond13
+
+for.end24:                                        ; preds = %for.cond13
+  %11 = load i32, i32* @g, align 4
+  %call25 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %11)
+  store i32 0, i32* @g, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond26
+
+for.cond26:                                       ; preds = %for.inc29, %for.end24
+  %12 = load i32, i32* %i, align 4
+  %cmp27 = icmp slt i32 %12, 1000000
+  br i1 %cmp27, label %for.body28, label %for.end31, !prof !5
+
+; CHECK: - for.body28: float = 500000.5, int = 4000003
+for.body28:                                       ; preds = %for.cond26
+  call void @bar()
+  br label %for.inc29
+
+for.inc29:                                        ; preds = %for.body28
+  %13 = load i32, i32* %i, align 4
+  %inc30 = add nsw i32 %13, 1
+  store i32 %inc30, i32* %i, align 4
+  br label %for.cond26
+
+for.end31:                                        ; preds = %for.cond26
+  %14 = load i32, i32* @g, align 4
+  %call32 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %14)
+  store i32 0, i32* @g, align 4
+  %15 = load i32, i32* %retval
+  ret i32 %15
+}
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
+!1 = !{!"branch_weights", i32 101, i32 2}
+!2 = !{!"branch_weights", i32 10001, i32 101}
+!3 = !{!"branch_weights", i32 1000001, i32 10001}
+!4 = !{!"branch_weights", i32 1000001, i32 101}
+!5 = !{!"branch_weights", i32 1000001, i32 2}