Skip to content

Commit

Permalink
Remove 4,096 loop scale limitation.
Browse files Browse the repository at this point in the history
Summary:
This is part 1 of fixes to address the problems described in
https://llvm.org/bugs/show_bug.cgi?id=22719.

The restriction to limit loop scales to 4,096 does not really prevent
overflows anymore, as the underlying algorithm has changed and does
not seem to suffer from this problem.

Additionally, artificially restricting loop scales to such a low number
skews frequency information, making loops of equal hotness appear to
have very different hotness properties.

The only loops that are artificially restricted to a scale of 4096 are
infinite loops (those loops with an exit mass of 0). This prevents
infinite loops from skewing the frequencies of other regions in the CFG.

At the end of propagation, frequencies are scaled to values that take no
more than 64 bits to represent. When the range of frequencies to be
represented fits within 61 bits, it pushes up the scaling factor to a
minimum of 8 to better distinguish small frequency values. Otherwise,
small frequency values are all saturated down at 1.

Tested on x86_64.

Reviewers: dexonsmith

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D8718

llvm-svn: 233826
  • Loading branch information
dnovillo committed Apr 1, 2015
1 parent f55ec07 commit a354f48
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 25 deletions.
3 changes: 0 additions & 3 deletions llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
Expand Up @@ -718,9 +718,6 @@ void IrreducibleGraph::addEdges(const BlockNode &Node,
///
/// It has some known flaws.
///
/// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
/// BlockFrequency's 64-bit integer precision.
///
/// - The model of irreducible control flow is a rough approximation.
///
/// Modelling irreducible control flow exactly involves setting up and
Expand Down
54 changes: 33 additions & 21 deletions llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
Expand Up @@ -331,32 +331,35 @@ bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
return true;
}

/// \brief Get the maximum allowed loop scale.
///
/// Gives the maximum number of estimated iterations allowed for a loop. Very
/// large numbers cause problems downstream (even within 64-bits).
static Scaled64 getMaxLoopScale() { return Scaled64(1, 12); }

/// \brief Compute the loop scale for a loop.
void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
// Compute loop scale.
DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");

// Infinite loops need special handling. If we give the back edge an infinite
// mass, they may saturate all the other scales in the function down to 1,
// making all the other region temperatures look exactly the same. Choose an
// arbitrary scale to avoid these issues.
//
// FIXME: An alternate way would be to select a symbolic scale which is later
// replaced to be the maximum of all computed scales plus 1. This would
// appropriately describe the loop as having a large scale, without skewing
// the final frequency computation.
const Scaled64 InifiniteLoopScale(1, 12);

// LoopScale == 1 / ExitMass
// ExitMass == HeadMass - BackedgeMass
BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;

// Block scale stores the inverse of the scale.
Loop.Scale = ExitMass.toScaled().inverse();
// Block scale stores the inverse of the scale. If this is an infinite loop,
// its exit mass will be zero. In this case, use an arbitrary scale for the
// loop scale.
Loop.Scale =
ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();

DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
<< " - " << Loop.BackedgeMass << ")\n"
<< " - scale = " << Loop.Scale << "\n");

if (Loop.Scale > getMaxLoopScale()) {
Loop.Scale = getMaxLoopScale();
DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n");
}
}

/// \brief Package up a loop.
Expand Down Expand Up @@ -424,15 +427,24 @@ static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
const Scaled64 &Min, const Scaled64 &Max) {
// Scale the Factor to a size that creates integers. Ideally, integers would
// be scaled so that Max == UINT64_MAX so that they can be best
// differentiated. However, the register allocator currently deals poorly
// with large numbers. Instead, push Min up a little from 1 to give some
// room to differentiate small, unequal numbers.
//
// TODO: fix issues downstream so that ScalingFactor can be
// Scaled64(1,64)/Max.
Scaled64 ScalingFactor = Min.inverse();
if ((Max / Min).lg() < 60)
// differentiated. However, in the presence of large frequency values, small
// frequencies are scaled down to 1, making it impossible to differentiate
// small, unequal numbers. When the spread between Min and Max frequencies
// fits well within MaxBits, we make the scale be at least 8.
const unsigned MaxBits = 64;
const unsigned SpreadBits = (Max / Min).lg();
Scaled64 ScalingFactor;
if (SpreadBits <= MaxBits - 3) {
// If the values are small enough, make the scaling factor at least 8 to
// allow distinguishing small values.
ScalingFactor = Min.inverse();
ScalingFactor <<= 3;
} else {
// If the values need more than MaxBits to be represented, saturate small
// frequency values down to 1 by using a scaling factor that benefits large
// frequency values.
ScalingFactor = Scaled64(1, MaxBits) / Max;
}

// Translate the floats to integers.
DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/Analysis/BlockFrequencyInfo/bad_input.ll
Expand Up @@ -32,7 +32,8 @@ define void @infinite_loop(i1 %x) {
entry:
br i1 %x, label %for.body, label %for.end, !prof !1

; Check that the loop scale maxes out at 4096, giving 2048 here.
; Check that the infinite loop is arbitrarily scaled to max out at 4096,
; giving 2048 here.
; CHECK-NEXT: for.body: float = 2048.0,
for.body:
%i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
Expand Down
204 changes: 204 additions & 0 deletions llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -0,0 +1,204 @@
; RUN: opt < %s -analyze -block-freq | FileCheck %s

; This code contains three loops. One is triple-nested, the
; second is double nested and the third is a single loop. At
; runtime, all three loops execute 1,000,000 times each. We use to
; give different frequencies to each of the loops because loop
; scales were limited to no more than 4,096.
;
; This was penalizing the hotness of the second and third loops
; because BFI was reducing the loop scale for for.cond16 and
; for.cond26 to a max of 4,096.
;
; Without this restriction, all loops are now correctly given the same
; frequency values.
;
; Original C code:
;
;
; int g;
; __attribute__((noinline)) void bar() {
; g++;
; }
;
; extern int printf(const char*, ...);
;
; int main()
; {
; int i, j, k;
;
; g = 0;
; for (i = 0; i < 100; i++)
; for (j = 0; j < 100; j++)
; for (k = 0; k < 100; k++)
; bar();
;
; printf ("g = %d\n", g);
; g = 0;
;
; for (i = 0; i < 100; i++)
; for (j = 0; j < 10000; j++)
; bar();
;
; printf ("g = %d\n", g);
; g = 0;
;
;
; for (i = 0; i < 1000000; i++)
; bar();
;
; printf ("g = %d\n", g);
; g = 0;
; }

@g = common global i32 0, align 4
@.str = private unnamed_addr constant [8 x i8] c"g = %d\0A\00", align 1

declare void @bar()
declare i32 @printf(i8*, ...)

; CHECK: Printing analysis {{.*}} for function 'main':
; CHECK-NEXT: block-frequency-info: main
define i32 @main() {
entry:
%retval = alloca i32, align 4
%i = alloca i32, align 4
%j = alloca i32, align 4
%k = alloca i32, align 4
store i32 0, i32* %retval
store i32 0, i32* @g, align 4
store i32 0, i32* %i, align 4
br label %for.cond

for.cond: ; preds = %for.inc10, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 100
br i1 %cmp, label %for.body, label %for.end12, !prof !1

for.body: ; preds = %for.cond
store i32 0, i32* %j, align 4
br label %for.cond1

for.cond1: ; preds = %for.inc7, %for.body
%1 = load i32, i32* %j, align 4
%cmp2 = icmp slt i32 %1, 100
br i1 %cmp2, label %for.body3, label %for.end9, !prof !2

for.body3: ; preds = %for.cond1
store i32 0, i32* %k, align 4
br label %for.cond4

for.cond4: ; preds = %for.inc, %for.body3
%2 = load i32, i32* %k, align 4
%cmp5 = icmp slt i32 %2, 100
br i1 %cmp5, label %for.body6, label %for.end, !prof !3

; CHECK: - for.body6: float = 500000.5, int = 4000003
for.body6: ; preds = %for.cond4
call void @bar()
br label %for.inc

for.inc: ; preds = %for.body6
%3 = load i32, i32* %k, align 4
%inc = add nsw i32 %3, 1
store i32 %inc, i32* %k, align 4
br label %for.cond4

for.end: ; preds = %for.cond4
br label %for.inc7

for.inc7: ; preds = %for.end
%4 = load i32, i32* %j, align 4
%inc8 = add nsw i32 %4, 1
store i32 %inc8, i32* %j, align 4
br label %for.cond1

for.end9: ; preds = %for.cond1
br label %for.inc10

for.inc10: ; preds = %for.end9
%5 = load i32, i32* %i, align 4
%inc11 = add nsw i32 %5, 1
store i32 %inc11, i32* %i, align 4
br label %for.cond

for.end12: ; preds = %for.cond
%6 = load i32, i32* @g, align 4
%call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %6)
store i32 0, i32* @g, align 4
store i32 0, i32* %i, align 4
br label %for.cond13

for.cond13: ; preds = %for.inc22, %for.end12
%7 = load i32, i32* %i, align 4
%cmp14 = icmp slt i32 %7, 100
br i1 %cmp14, label %for.body15, label %for.end24, !prof !1

for.body15: ; preds = %for.cond13
store i32 0, i32* %j, align 4
br label %for.cond16

for.cond16: ; preds = %for.inc19, %for.body15
%8 = load i32, i32* %j, align 4
%cmp17 = icmp slt i32 %8, 10000
br i1 %cmp17, label %for.body18, label %for.end21, !prof !4

; CHECK: - for.body18: float = 500000.5, int = 4000003
for.body18: ; preds = %for.cond16
call void @bar()
br label %for.inc19

for.inc19: ; preds = %for.body18
%9 = load i32, i32* %j, align 4
%inc20 = add nsw i32 %9, 1
store i32 %inc20, i32* %j, align 4
br label %for.cond16

for.end21: ; preds = %for.cond16
br label %for.inc22

for.inc22: ; preds = %for.end21
%10 = load i32, i32* %i, align 4
%inc23 = add nsw i32 %10, 1
store i32 %inc23, i32* %i, align 4
br label %for.cond13

for.end24: ; preds = %for.cond13
%11 = load i32, i32* @g, align 4
%call25 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %11)
store i32 0, i32* @g, align 4
store i32 0, i32* %i, align 4
br label %for.cond26

for.cond26: ; preds = %for.inc29, %for.end24
%12 = load i32, i32* %i, align 4
%cmp27 = icmp slt i32 %12, 1000000
br i1 %cmp27, label %for.body28, label %for.end31, !prof !5

; CHECK: - for.body28: float = 500000.5, int = 4000003
for.body28: ; preds = %for.cond26
call void @bar()
br label %for.inc29

for.inc29: ; preds = %for.body28
%13 = load i32, i32* %i, align 4
%inc30 = add nsw i32 %13, 1
store i32 %inc30, i32* %i, align 4
br label %for.cond26

for.end31: ; preds = %for.cond26
%14 = load i32, i32* @g, align 4
%call32 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0), i32 %14)
store i32 0, i32* @g, align 4
%15 = load i32, i32* %retval
ret i32 %15
}

!llvm.ident = !{!0}

!0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
!1 = !{!"branch_weights", i32 101, i32 2}
!2 = !{!"branch_weights", i32 10001, i32 101}
!3 = !{!"branch_weights", i32 1000001, i32 10001}
!4 = !{!"branch_weights", i32 1000001, i32 101}
!5 = !{!"branch_weights", i32 1000001, i32 2}

0 comments on commit a354f48

Please sign in to comment.