-
Notifications
You must be signed in to change notification settings - Fork 11.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ObjectSizeOffsetVisitor] Bail after visiting 100 instructions #67479
Conversation
We're running into stack overflows for huge functions with lots of phis. Even without the stack overflows, this is recursing >7000 in some auto-generated code. This fixes the stack overflow and brings down the compile time to something reasonable.
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms ChangesWe're running into stack overflows for huge functions with lots of phis. Even without the stack overflows, this is recursing >7000 in some auto-generated code. This fixes the stack overflow and brings down the compile time to something reasonable. Full diff: https://github.com/llvm/llvm-project/pull/67479.diff 3 Files Affected:
diff --git a/llvm/include/llvm/Analysis/MemoryBuiltins.h b/llvm/include/llvm/Analysis/MemoryBuiltins.h
index 1d9831889f573ab..49e5b69898427d1 100644
--- a/llvm/include/llvm/Analysis/MemoryBuiltins.h
+++ b/llvm/include/llvm/Analysis/MemoryBuiltins.h
@@ -199,6 +199,7 @@ class ObjectSizeOffsetVisitor
unsigned IntTyBits;
APInt Zero;
SmallDenseMap<Instruction *, SizeOffsetType, 8> SeenInsts;
+ unsigned RecurseDepth = 0;
APInt align(APInt Size, MaybeAlign Align);
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 9eab801c3ccbb97..d727e8214dc7938 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -50,6 +51,12 @@ using namespace llvm;
#define DEBUG_TYPE "memory-builtins"
+static cl::opt<unsigned> ObjectSizeOffsetVisitorMaxRecurseDepth(
+ "object-size-offset-visitor-max-recurse-depth",
+ cl::desc(
+ "Maximum number of PHIs for ObjectSizeOffsetVisitor to look through"),
+ cl::init(100));
+
enum AllocType : uint8_t {
OpNewLike = 1<<0, // allocates; never returns null
MallocLike = 1<<1, // allocates; may return null
@@ -994,14 +1001,20 @@ SizeOffsetType ObjectSizeOffsetVisitor::combineSizeOffset(SizeOffsetType LHS,
}
SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode &PN) {
- if (PN.getNumIncomingValues() == 0)
+ if (PN.getNumIncomingValues() == 0 ||
+ RecurseDepth >= ObjectSizeOffsetVisitorMaxRecurseDepth)
return unknown();
+
+ ++RecurseDepth;
auto IncomingValues = PN.incoming_values();
- return std::accumulate(IncomingValues.begin() + 1, IncomingValues.end(),
- compute(*IncomingValues.begin()),
- [this](SizeOffsetType LHS, Value *VRHS) {
- return combineSizeOffset(LHS, compute(VRHS));
- });
+ SizeOffsetType Ret =
+ std::accumulate(IncomingValues.begin() + 1, IncomingValues.end(),
+ compute(*IncomingValues.begin()),
+ [this](SizeOffsetType LHS, Value *VRHS) {
+ return combineSizeOffset(LHS, compute(VRHS));
+ });
+ --RecurseDepth;
+ return Ret;
}
SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
diff --git a/llvm/test/Transforms/DeadStoreElimination/object-size-offset-visitor-max-recursion.ll b/llvm/test/Transforms/DeadStoreElimination/object-size-offset-visitor-max-recursion.ll
new file mode 100644
index 000000000000000..ee6db23f13581d4
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/object-size-offset-visitor-max-recursion.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=dse -S -object-size-offset-visitor-max-recurse-depth=1 < %s | FileCheck %s --check-prefix=NO
+; RUN: opt -passes=dse -S -object-size-offset-visitor-max-recurse-depth=2 < %s | FileCheck %s --check-prefix=YES
+
+declare void @use(ptr)
+
+define void @f(i32 %i, i1 %c) {
+; NO-LABEL: define void @f(
+; NO-SAME: i32 [[I:%.*]], i1 [[C:%.*]]) {
+; NO-NEXT: b0:
+; NO-NEXT: [[A1:%.*]] = alloca i32, align 4
+; NO-NEXT: [[A2:%.*]] = alloca i32, align 4
+; NO-NEXT: br i1 [[C]], label [[B1:%.*]], label [[B2:%.*]]
+; NO: b1:
+; NO-NEXT: [[A3:%.*]] = phi ptr [ [[A1]], [[B0:%.*]] ]
+; NO-NEXT: br label [[B3:%.*]]
+; NO: b2:
+; NO-NEXT: [[A4:%.*]] = phi ptr [ [[A2]], [[B0]] ]
+; NO-NEXT: br label [[B3]]
+; NO: b3:
+; NO-NEXT: [[A5:%.*]] = phi ptr [ [[A3]], [[B1]] ], [ [[A4]], [[B2]] ]
+; NO-NEXT: [[G:%.*]] = getelementptr i8, ptr [[A5]], i32 [[I]]
+; NO-NEXT: store i8 1, ptr [[G]], align 1
+; NO-NEXT: store i32 0, ptr [[A5]], align 4
+; NO-NEXT: call void @use(ptr [[A5]])
+; NO-NEXT: ret void
+;
+; YES-LABEL: define void @f(
+; YES-SAME: i32 [[I:%.*]], i1 [[C:%.*]]) {
+; YES-NEXT: b0:
+; YES-NEXT: [[A1:%.*]] = alloca i32, align 4
+; YES-NEXT: [[A2:%.*]] = alloca i32, align 4
+; YES-NEXT: br i1 [[C]], label [[B1:%.*]], label [[B2:%.*]]
+; YES: b1:
+; YES-NEXT: [[A3:%.*]] = phi ptr [ [[A1]], [[B0:%.*]] ]
+; YES-NEXT: br label [[B3:%.*]]
+; YES: b2:
+; YES-NEXT: [[A4:%.*]] = phi ptr [ [[A2]], [[B0]] ]
+; YES-NEXT: br label [[B3]]
+; YES: b3:
+; YES-NEXT: [[A5:%.*]] = phi ptr [ [[A3]], [[B1]] ], [ [[A4]], [[B2]] ]
+; YES-NEXT: store i32 0, ptr [[A5]], align 4
+; YES-NEXT: call void @use(ptr [[A5]])
+; YES-NEXT: ret void
+;
+b0:
+ %a1 = alloca i32
+ %a2 = alloca i32
+ br i1 %c, label %b1, label %b2
+b1:
+ %a3 = phi ptr [ %a1, %b0 ]
+ br label %b3
+b2:
+ %a4 = phi ptr [ %a2, %b0 ]
+ br label %b3
+b3:
+ %a5 = phi ptr [ %a3, %b1 ], [ %a4, %b2 ]
+ %g = getelementptr i8, ptr %a5, i32 %i
+ store i8 1, ptr %g
+ store i32 0, ptr %a5
+ call void @use(ptr %a5)
+ ret void
+}
|
"object-size-offset-visitor-max-recurse-depth", | ||
cl::desc( | ||
"Maximum number of PHIs for ObjectSizeOffsetVisitor to look through"), | ||
cl::init(100)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should either not be a depth (but total visited count) or be a lot smaller. Recursing 100 levels allows for a lot of fan-out. Especially as this is only counting the phis and not other instructions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lowered to 20.
I think it's unlikely that we hit huge fanout in real world code, it's typically pretty linear at least with large autogenerated code. In the case I was looking at, this dropped DSE time from >30s to <1s.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's unlikely that we hit huge fanout in real world code, it's typically pretty linear at least with large autogenerated code.
I guess the fact that values don't get revisited does avoid large fan-out in practice.
In the case I was looking at, this dropped DSE time from >30s to <1s.
Huh, that sounds like a bug. We usually don't query objectsize on non-root objects, and DSE probably shouldn't do that either.
Somewhat expensive objectsize analysis is okay for actual llvm.objectsize intrinsics, but not for generic passes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've adjusted the call in DSE to check for an identified object first: 7aab12e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed with having a recursion depth limit. Where to put the limit may need adjusting, but this looks good now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is also recursion (though less likely) when visiting loads. I think it would be more robust (and a bit simpler) to instead do this depth inc/dec around the visit() call in
SizeOffsetType Res = visit(*I); |
needed to create a new helper function since we need to track InstructionsVisited per call to compute()
if (Instruction *I = dyn_cast<Instruction>(V)) { | ||
// If we have already seen this instruction, bail out. Cycles can happen in | ||
// unreachable code after constant propagation. | ||
auto P = SeenInsts.try_emplace(I, unknown()); | ||
if (!P.second) | ||
return P.first->second; | ||
++InstructionsVisited; | ||
if (InstructionsVisited > ObjectSizeOffsetVisitorMaxVisitInstructions) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could check SeenInsts.count()
instead of a separate variable here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SeenInsts
is a cache that can carry over between calls to ObjectSizeOffsetVisitor::compute()
, but I think we want ObjectSizeOffsetVisitorMaxVisitInstructions
to apply per call to ObjectSizeOffsetVisitor::compute()
. This does lead us to a weird situation where successive calls to compute()
will return different answers since we may explore more from cached results, but I think that's better than if somebody reuses a ObjectSizeOffsetVisitor
and later calls to it all return unknown()
since previous calls filled up the cache to ObjectSizeOffsetVisitorMaxVisitInstructions
size
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. I was not aware that ObjectSizeOffsetVisitor is designed for reuse, but apparently AddressSanitizer does use it that way.
Possibly we should be clearing SeenInsts for each call as well...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
if (Instruction *I = dyn_cast<Instruction>(V)) { | ||
// If we have already seen this instruction, bail out. Cycles can happen in | ||
// unreachable code after constant propagation. | ||
auto P = SeenInsts.try_emplace(I, unknown()); | ||
if (!P.second) | ||
return P.first->second; | ||
++InstructionsVisited; | ||
if (InstructionsVisited > ObjectSizeOffsetVisitorMaxVisitInstructions) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. I was not aware that ObjectSizeOffsetVisitor is designed for reuse, but apparently AddressSanitizer does use it that way.
Possibly we should be clearing SeenInsts for each call as well...
This was merged after a different change caused the test to fail in the meantime.
With 7aab12e, the test is no longer relevant, but the patch is still good to have.
Local branch amd-gfx a3e6510 Merged main:08136d822c73 into amd-gfx:5c4b01290188 Remote branch main 339fc5e [test] Mark test added in llvm#67479 as XFAIL
…67479) We're running into stack overflows for huge functions with lots of phis. Even without the stack overflows, this is recursing >7000 in some auto-generated code. This fixes the stack overflow and brings down the compile time to something reasonable.
This was merged after a different change caused the test to fail in the meantime.
With 7aab12e, the test is no longer relevant, but the patch is still good to have.
We're running into stack overflows for huge functions with lots of phis. Even without the stack overflows, this is recursing >7000 in some auto-generated code.
This fixes the stack overflow and brings down the compile time to something reasonable.