From a354f4889156d3efabed681694f577e5038f6211 Mon Sep 17 00:00:00 2001
From: Diego Novillo <dnovillo@google.com>
Date: Wed, 1 Apr 2015 17:42:27 +0000
Subject: [PATCH] Remove 4,096 loop scale limitation.

Summary:
This is part 1 of fixes to address the problems described in
https://llvm.org/bugs/show_bug.cgi?id=22719.

The restriction to limit loop scales to 4,096 does not really prevent
overflows anymore, as the underlying algorithm has changed and does
not seem to suffer from this problem.

Additionally, artificially restricting loop scales to such a low number
skews frequency information, making loops of equal hotness appear to
have very different hotness properties.

The only loops that are artificially restricted to a scale of 4096 are
infinite loops (those loops with an exit mass of 0). This prevents
infinite loops from skewing the frequencies of other regions in the CFG.

At the end of propagation, frequencies are scaled to values that take no
more than 64 bits to represent. When the range of frequencies to be
represented fits within 61 bits, it pushes up the scaling factor to a
minimum of 8 to better distinguish small frequency values. Otherwise,
small frequency values are all saturated down at 1.

Tested on x86_64.

Reviewers: dexonsmith

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D8718

llvm-svn: 233826
---
 .../llvm/Analysis/BlockFrequencyInfoImpl.h    |   3 -
 llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp  |  54 +++--
 .../Analysis/BlockFrequencyInfo/bad_input.ll  |   3 +-
 .../loops_with_profile_info.ll                | 204 ++++++++++++++++++
 4 files changed, 239 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll

diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 57b515420925b..9acc863a7382c 100644
--- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -718,9 +718,6 @@ void IrreducibleGraph::addEdges(const BlockNode &Node,
 ///
 /// It has some known flaws.
 ///
-///   - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
-///     BlockFrequency's 64-bit integer precision.
-///
 ///   - The model of irreducible control flow is a rough approximation.
 ///
 ///     Modelling irreducible control flow exactly involves setting up and
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 278073cd6c52d..456cee179f0bc 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -331,32 +331,35 @@ bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
   return true;
 }
 
-/// \brief Get the maximum allowed loop scale.
-///
-/// Gives the maximum number of estimated iterations allowed for a loop.  Very
-/// large numbers cause problems downstream (even within 64-bits).
-static Scaled64 getMaxLoopScale() { return Scaled64(1, 12); }
-
 /// \brief Compute the loop scale for a loop.
 void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
   // Compute loop scale.
   DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");
 
+  // Infinite loops need special handling. If we give the back edge an infinite
+  // mass, they may saturate all the other scales in the function down to 1,
+  // making all the other region temperatures look exactly the same. Choose an
+  // arbitrary scale to avoid these issues.
+  //
+  // FIXME: An alternate way would be to select a symbolic scale which is later
+  // replaced to be the maximum of all computed scales plus 1. This would
+  // appropriately describe the loop as having a large scale, without skewing
+  // the final frequency computation.
+  const Scaled64 InifiniteLoopScale(1, 12);
+
   // LoopScale == 1 / ExitMass
   // ExitMass == HeadMass - BackedgeMass
   BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;
 
-  // Block scale stores the inverse of the scale.
-  Loop.Scale = ExitMass.toScaled().inverse();
+  // Block scale stores the inverse of the scale. If this is an infinite loop,
+  // its exit mass will be zero. In this case, use an arbitrary scale for the
+  // loop scale.
+  Loop.Scale =
+      ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();
 
   DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
                << " - " << Loop.BackedgeMass << ")\n"
                << " - scale = " << Loop.Scale << "\n");
-
-  if (Loop.Scale > getMaxLoopScale()) {
-    Loop.Scale = getMaxLoopScale();
-    DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n");
-  }
 }
 
 /// \brief Package up a loop.
@@ -424,15 +427,24 @@ static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                      const Scaled64 &Min, const Scaled64 &Max) {
   // Scale the Factor to a size that creates integers.  Ideally, integers would
   // be scaled so that Max == UINT64_MAX so that they can be best
-  // differentiated.  However, the register allocator currently deals poorly
-  // with large numbers.  Instead, push Min up a little from 1 to give some
-  // room to differentiate small, unequal numbers.
-  //
-  // TODO: fix issues downstream so that ScalingFactor can be
-  // Scaled64(1,64)/Max.
-  Scaled64 ScalingFactor = Min.inverse();
-  if ((Max / Min).lg() < 60)
+  // differentiated.  However, in the presence of large frequency values, small
+  // frequencies are scaled down to 1, making it impossible to differentiate
+  // small, unequal numbers. When the spread between Min and Max frequencies
+  // fits well within MaxBits, we make the scale be at least 8.
+  const unsigned MaxBits = 64;
+  const unsigned SpreadBits = (Max / Min).lg();
+  Scaled64 ScalingFactor;
+  if (SpreadBits <= MaxBits - 3) {
+    // If the values are small enough, make the scaling factor at least 8 to
+    // allow distinguishing small values.
+    ScalingFactor = Min.inverse();
     ScalingFactor <<= 3;
+  } else {
+    // If the values need more than MaxBits to be represented, saturate small
+    // frequency values down to 1 by using a scaling factor that benefits large
+    // frequency values.
+    ScalingFactor = Scaled64(1, MaxBits) / Max;
+  }
 
   // Translate the floats to integers.
   DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/bad_input.ll b/llvm/test/Analysis/BlockFrequencyInfo/bad_input.ll
index da62dca67a8b8..e5b1f500e1e64 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/bad_input.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/bad_input.ll
@@ -32,7 +32,8 @@ define void @infinite_loop(i1 %x) {
 entry:
   br i1 %x, label %for.body, label %for.end, !prof !1
 
-; Check that the loop scale maxes out at 4096, giving 2048 here.
+; Check that the infinite loop is arbitrarily scaled to max out at 4096,
+; giving 2048 here.
 ; CHECK-NEXT: for.body: float = 2048.0,
 for.body:
   %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
new file mode 100644
index 0000000000000..e5e3f64b649b8
--- /dev/null
+++ b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -0,0 +1,204 @@
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+
+; This code contains three loops. One is triple-nested, the
+; second is double nested and the third is a single loop. At
+; runtime, all three loops execute 1,000,000 times each. We use to
+; give different frequencies to each of the loops because loop
+; scales were limited to no more than 4,096.
+;
+; This was penalizing the hotness of the second and third loops
+; because BFI was reducing the loop scale for for.cond16 and
+; for.cond26 to a max of 4,096.
+;
+; Without this restriction, all loops are now correctly given the same
+; frequency values.
+;
+; Original C code:
+;
+;
+; int g;
+; __attribute__((noinline)) void bar() {
+;  g++;
+; }
+;
+; extern int printf(const char*, ...);
+;
+; int main()
+; {
+;   int i, j, k;
+;
+;   g = 0;
+;   for (i = 0; i < 100; i++)
+;     for (j = 0; j < 100; j++)
+;        for (k = 0; k < 100; k++)
+;            bar();
+;
+;   printf ("g = %d\n", g);
+;   g = 0;
+;
+;   for (i = 0; i < 100; i++)
+;     for (j = 0; j < 10000; j++)
+;         bar();
+;
+;   printf ("g = %d\n", g);
+;   g = 0;
+;
+;
+;   for (i = 0; i < 1000000; i++)
+;     bar();
+;
+;   printf ("g = %d\n", g);
+;   g = 0;
+; }
+
+@g = common global i32 0, align 4
+@.str = private unnamed_addr constant [8 x i8] c"g = %d\0A\00", align 1
+
+declare void @bar()
+declare i32 @printf(i8*, ...)
+
+; CHECK: Printing analysis {{.*}} for function 'main':
+; CHECK-NEXT: block-frequency-info: main
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  %k = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* @g, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end12, !prof !1
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %j, align 4
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc7, %for.body
+  %1 = load i32, i32* %j, align 4
+  %cmp2 = icmp slt i32 %1, 100
+  br i1 %cmp2, label %for.body3, label %for.end9, !prof !2
+
+for.body3:                                        ; preds = %for.cond1
+  store i32 0, i32* %k, align 4
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %2 = load i32, i32* %k, align 4
+  %cmp5 = icmp slt i32 %2, 100
+  br i1 %cmp5, label %for.body6, label %for.end, !prof !3
+
+; CHECK: - for.body6: float = 500000.5, int = 4000003
+for.body6:                                        ; preds = %for.cond4
+  call void @bar()
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %3 = load i32, i32* %k, align 4
+  %inc = add nsw i32 %3, 1
+  store i32 %inc, i32* %k, align 4
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc7
+
+for.inc7:                                         ; preds = %for.end
+  %4 = load i32, i32* %j, align 4
+  %inc8 = add nsw i32 %4, 1
+  store i32 %inc8, i32* %j, align 4
+  br label %for.cond1
+
+for.end9:                                         ; preds = %for.cond1
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end9
+  %5 = load i32, i32* %i, align 4
+  %inc11 = add nsw i32 %5, 1
+  store i32 %inc11, i32* %i, align 4
+  br label %for.cond
+
+for.end12:                                        ; preds = %for.cond
+  %6 = load i32, i32* @g, align 4
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %6)
+  store i32 0, i32* @g, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond13
+
+for.cond13:                                       ; preds = %for.inc22, %for.end12
+  %7 = load i32, i32* %i, align 4
+  %cmp14 = icmp slt i32 %7, 100
+  br i1 %cmp14, label %for.body15, label %for.end24, !prof !1
+
+for.body15:                                       ; preds = %for.cond13
+  store i32 0, i32* %j, align 4
+  br label %for.cond16
+
+for.cond16:                                       ; preds = %for.inc19, %for.body15
+  %8 = load i32, i32* %j, align 4
+  %cmp17 = icmp slt i32 %8, 10000
+  br i1 %cmp17, label %for.body18, label %for.end21, !prof !4
+
+; CHECK: - for.body18: float = 500000.5, int = 4000003
+for.body18:                                       ; preds = %for.cond16
+  call void @bar()
+  br label %for.inc19
+
+for.inc19:                                        ; preds = %for.body18
+  %9 = load i32, i32* %j, align 4
+  %inc20 = add nsw i32 %9, 1
+  store i32 %inc20, i32* %j, align 4
+  br label %for.cond16
+
+for.end21:                                        ; preds = %for.cond16
+  br label %for.inc22
+
+for.inc22:                                        ; preds = %for.end21
+  %10 = load i32, i32* %i, align 4
+  %inc23 = add nsw i32 %10, 1
+  store i32 %inc23, i32* %i, align 4
+  br label %for.cond13
+
+for.end24:                                        ; preds = %for.cond13
+  %11 = load i32, i32* @g, align 4
+  %call25 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %11)
+  store i32 0, i32* @g, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond26
+
+for.cond26:                                       ; preds = %for.inc29, %for.end24
+  %12 = load i32, i32* %i, align 4
+  %cmp27 = icmp slt i32 %12, 1000000
+  br i1 %cmp27, label %for.body28, label %for.end31, !prof !5
+
+; CHECK: - for.body28: float = 500000.5, int = 4000003
+for.body28:                                       ; preds = %for.cond26
+  call void @bar()
+  br label %for.inc29
+
+for.inc29:                                        ; preds = %for.body28
+  %13 = load i32, i32* %i, align 4
+  %inc30 = add nsw i32 %13, 1
+  store i32 %inc30, i32* %i, align 4
+  br label %for.cond26
+
+for.end31:                                        ; preds = %for.cond26
+  %14 = load i32, i32* @g, align 4
+  %call32 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %14)
+  store i32 0, i32* @g, align 4
+  %15 = load i32, i32* %retval
+  ret i32 %15
+}
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
+!1 = !{!"branch_weights", i32 101, i32 2}
+!2 = !{!"branch_weights", i32 10001, i32 101}
+!3 = !{!"branch_weights", i32 1000001, i32 10001}
+!4 = !{!"branch_weights", i32 1000001, i32 101}
+!5 = !{!"branch_weights", i32 1000001, i32 2}