New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[clang] Optimize castToDeclContext for 2% improvement in build times #76825
[clang] Optimize castToDeclContext for 2% improvement in build times #76825
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write If you have received no comments on your PR for a week, you can request a review If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums. |
@llvm/pr-subscribers-clang Author: Pol M (Destroyerrrocket) ChangesFull diff: https://github.com/llvm/llvm-project/pull/76825.diff 5 Files Affected:
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index 432293583576b5..984a4d8bab5e77 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -2044,6 +2044,14 @@ class RequiresExprBodyDecl : public Decl, public DeclContext {
// Implement isa/cast/dyncast/etc.
static bool classof(const Decl *D) { return classofKind(D->getKind()); }
static bool classofKind(Kind K) { return K == RequiresExprBody; }
+
+ static DeclContext *castToDeclContext(const RequiresExprBodyDecl *D) {
+ return static_cast<DeclContext *>(const_cast<RequiresExprBodyDecl *>(D));
+ }
+
+ static RequiresExprBodyDecl *castFromDeclContext(const DeclContext *DC) {
+ return static_cast<RequiresExprBodyDecl *>(const_cast<DeclContext *>(DC));
+ }
};
/// Represents a static or instance method of a struct/union/class.
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 5e03f0223d311c..b1733c2d052a65 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -930,20 +930,14 @@ const AttrVec &Decl::getAttrs() const {
Decl *Decl::castFromDeclContext (const DeclContext *D) {
Decl::Kind DK = D->getDeclKind();
- switch(DK) {
-#define DECL(NAME, BASE)
-#define DECL_CONTEXT(NAME) \
- case Decl::NAME: \
- return static_cast<NAME##Decl *>(const_cast<DeclContext *>(D));
-#define DECL_CONTEXT_BASE(NAME)
-#include "clang/AST/DeclNodes.inc"
- default:
+ switch (DK) {
#define DECL(NAME, BASE)
-#define DECL_CONTEXT_BASE(NAME) \
- if (DK >= first##NAME && DK <= last##NAME) \
- return static_cast<NAME##Decl *>(const_cast<DeclContext *>(D));
+#define DECL_CONTEXT(NAME) \
+ case Decl::NAME: \
+ return static_cast<NAME##Decl *>(const_cast<DeclContext *>(D));
#include "clang/AST/DeclNodes.inc"
- llvm_unreachable("a decl that inherits DeclContext isn't handled");
+ default:
+ llvm_unreachable("a decl that inherits DeclContext isn't handled");
}
}
@@ -951,18 +945,12 @@ DeclContext *Decl::castToDeclContext(const Decl *D) {
Decl::Kind DK = D->getKind();
switch(DK) {
#define DECL(NAME, BASE)
-#define DECL_CONTEXT(NAME) \
- case Decl::NAME: \
- return static_cast<NAME##Decl *>(const_cast<Decl *>(D));
-#define DECL_CONTEXT_BASE(NAME)
+#define DECL_CONTEXT(NAME) \
+ case Decl::NAME: \
+ return static_cast<NAME##Decl *>(const_cast<Decl *>(D));
#include "clang/AST/DeclNodes.inc"
- default:
-#define DECL(NAME, BASE)
-#define DECL_CONTEXT_BASE(NAME) \
- if (DK >= first##NAME && DK <= last##NAME) \
- return static_cast<NAME##Decl *>(const_cast<Decl *>(D));
-#include "clang/AST/DeclNodes.inc"
- llvm_unreachable("a decl that inherits DeclContext isn't handled");
+ default:
+ llvm_unreachable("a decl that inherits DeclContext isn't handled");
}
}
@@ -1129,20 +1117,14 @@ DeclContext::DeclContext(Decl::Kind K) {
}
bool DeclContext::classof(const Decl *D) {
- switch (D->getKind()) {
+ Decl::Kind DK = D->getKind();
+ switch (DK) {
#define DECL(NAME, BASE)
#define DECL_CONTEXT(NAME) case Decl::NAME:
-#define DECL_CONTEXT_BASE(NAME)
#include "clang/AST/DeclNodes.inc"
- return true;
- default:
-#define DECL(NAME, BASE)
-#define DECL_CONTEXT_BASE(NAME) \
- if (D->getKind() >= Decl::first##NAME && \
- D->getKind() <= Decl::last##NAME) \
- return true;
-#include "clang/AST/DeclNodes.inc"
- return false;
+ return true;
+ default:
+ return false;
}
}
diff --git a/clang/utils/TableGen/ClangASTNodesEmitter.cpp b/clang/utils/TableGen/ClangASTNodesEmitter.cpp
index 16a1c74b9d91ad..63220aad50aecd 100644
--- a/clang/utils/TableGen/ClangASTNodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangASTNodesEmitter.cpp
@@ -33,6 +33,7 @@ class ClangASTNodesEmitter {
typedef std::multimap<ASTNode, ASTNode> ChildMap;
typedef ChildMap::const_iterator ChildIterator;
+ std::set<ASTNode> PrioritizedClasses;
RecordKeeper &Records;
ASTNode Root;
const std::string &NodeClassName;
@@ -70,8 +71,15 @@ class ClangASTNodesEmitter {
std::pair<ASTNode, ASTNode> EmitNode(raw_ostream& OS, ASTNode Base);
public:
explicit ClangASTNodesEmitter(RecordKeeper &R, const std::string &N,
- const std::string &S)
- : Records(R), NodeClassName(N), BaseSuffix(S) {}
+ const std::string &S,
+ const std::string &PriorizeIfSubclassOf)
+ : Records(R), NodeClassName(N), BaseSuffix(S) {
+ auto vecPriorized = PriorizeIfSubclassOf.empty()
+ ? std::vector<Record *>{}
+ : R.getAllDerivedDefinitions(PriorizeIfSubclassOf);
+ PrioritizedClasses =
+ std::set<ASTNode>(vecPriorized.begin(), vecPriorized.end());
+ }
// run - Output the .inc file contents
void run(raw_ostream &OS);
@@ -95,8 +103,31 @@ std::pair<ASTNode, ASTNode> ClangASTNodesEmitter::EmitNode(raw_ostream &OS,
if (!Base.isAbstract())
First = Last = Base;
+ class Comp {
+ std::set<ASTNode> &PrioritizedClasses;
+
+ public:
+ Comp(std::set<ASTNode> &PrioritizedClasses)
+ : PrioritizedClasses(PrioritizedClasses) {}
+ bool operator()(ASTNode LHS, ASTNode RHS) const {
+ auto LHSPriorized = PrioritizedClasses.count(LHS) > 0;
+ auto RHSPriorized = PrioritizedClasses.count(RHS) > 0;
+ if (LHSPriorized && !RHSPriorized)
+ return true;
+ if (!LHSPriorized && RHSPriorized)
+ return false;
+
+ return LHS.getName() > RHS.getName();
+ }
+ };
+
+ auto SortedChildren = std::set<ASTNode, Comp>(Comp(PrioritizedClasses));
+
for (; i != e; ++i) {
- ASTNode Child = i->second;
+ SortedChildren.insert(i->second);
+ }
+
+ for (const auto &Child : SortedChildren) {
bool Abstract = Child.isAbstract();
std::string NodeName = macroName(std::string(Child.getName()));
@@ -182,9 +213,9 @@ void ClangASTNodesEmitter::run(raw_ostream &OS) {
OS << "#endif\n\n";
OS << "#ifndef LAST_" << macroHierarchyName() << "_RANGE\n";
- OS << "# define LAST_"
- << macroHierarchyName() << "_RANGE(Base, First, Last) "
- << macroHierarchyName() << "_RANGE(Base, First, Last)\n";
+ OS << "# define LAST_" << macroHierarchyName()
+ << "_RANGE(Base, First, Last) " << macroHierarchyName()
+ << "_RANGE(Base, First, Last)\n";
OS << "#endif\n\n";
EmitNode(OS, Root);
@@ -196,8 +227,20 @@ void ClangASTNodesEmitter::run(raw_ostream &OS) {
}
void clang::EmitClangASTNodes(RecordKeeper &RK, raw_ostream &OS,
- const std::string &N, const std::string &S) {
- ClangASTNodesEmitter(RK, N, S).run(OS);
+ const std::string &N, const std::string &S,
+ const std::string &PriorizeIfSubclassOf) {
+ ClangASTNodesEmitter(RK, N, S, PriorizeIfSubclassOf).run(OS);
+}
+
+void printDeclContext(std::multimap<Record *, Record *> &Tree,
+ Record *DeclContext, raw_ostream &OS) {
+ if (!DeclContext->getValueAsBit(AbstractFieldName))
+ OS << "DECL_CONTEXT(" << DeclContext->getName() << ")\n";
+ auto i = Tree.lower_bound(DeclContext);
+ auto end = Tree.upper_bound(DeclContext);
+ for (; i != end; ++i) {
+ printDeclContext(Tree, i->second, OS);
+ }
}
// Emits and addendum to a .inc file to enumerate the clang declaration
@@ -210,38 +253,26 @@ void clang::EmitClangDeclContext(RecordKeeper &Records, raw_ostream &OS) {
OS << "#ifndef DECL_CONTEXT\n";
OS << "# define DECL_CONTEXT(DECL)\n";
OS << "#endif\n";
-
- OS << "#ifndef DECL_CONTEXT_BASE\n";
- OS << "# define DECL_CONTEXT_BASE(DECL) DECL_CONTEXT(DECL)\n";
- OS << "#endif\n";
-
- typedef std::set<Record*> RecordSet;
- typedef std::vector<Record*> RecordVector;
-
- RecordVector DeclContextsVector
- = Records.getAllDerivedDefinitions(DeclContextNodeClassName);
- RecordVector Decls = Records.getAllDerivedDefinitions(DeclNodeClassName);
- RecordSet DeclContexts (DeclContextsVector.begin(), DeclContextsVector.end());
-
- for (RecordVector::iterator i = Decls.begin(), e = Decls.end(); i != e; ++i) {
- Record *R = *i;
-
- if (Record *B = R->getValueAsOptionalDef(BaseFieldName)) {
- if (DeclContexts.find(B) != DeclContexts.end()) {
- OS << "DECL_CONTEXT_BASE(" << B->getName() << ")\n";
- DeclContexts.erase(B);
- }
- }
+
+ std::vector<Record *> DeclContextsVector =
+ Records.getAllDerivedDefinitions(DeclContextNodeClassName);
+ std::vector<Record *> Decls =
+ Records.getAllDerivedDefinitions(DeclNodeClassName);
+
+ std::multimap<Record *, Record *> Tree;
+
+ const std::vector<Record *> Stmts =
+ Records.getAllDerivedDefinitions(DeclNodeClassName);
+
+ for (unsigned i = 0, e = Stmts.size(); i != e; ++i) {
+ Record *R = Stmts[i];
+ if (auto *B = R->getValueAsOptionalDef(BaseFieldName))
+ Tree.insert(std::make_pair(B, R));
}
- // To keep identical order, RecordVector may be used
- // instead of RecordSet.
- for (RecordVector::iterator
- i = DeclContextsVector.begin(), e = DeclContextsVector.end();
- i != e; ++i)
- if (DeclContexts.find(*i) != DeclContexts.end())
- OS << "DECL_CONTEXT(" << (*i)->getName() << ")\n";
+ for (auto *DeclContext : DeclContextsVector) {
+ printDeclContext(Tree, DeclContext, OS);
+ }
OS << "#undef DECL_CONTEXT\n";
- OS << "#undef DECL_CONTEXT_BASE\n";
}
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
index c1f2ca15b595c0..3859555d647fd1 100644
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -398,7 +398,8 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
EmitClangASTNodes(Records, OS, CommentNodeClassName, "");
break;
case GenClangDeclNodes:
- EmitClangASTNodes(Records, OS, DeclNodeClassName, "Decl");
+ EmitClangASTNodes(Records, OS, DeclNodeClassName, "Decl",
+ DeclContextNodeClassName);
EmitClangDeclContext(Records, OS);
break;
case GenClangStmtNodes:
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index 35f2f04c1e818d..ae895698ce5468 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -26,7 +26,8 @@ namespace clang {
void EmitClangDeclContext(llvm::RecordKeeper &RK, llvm::raw_ostream &OS);
void EmitClangASTNodes(llvm::RecordKeeper &RK, llvm::raw_ostream &OS,
- const std::string &N, const std::string &S);
+ const std::string &N, const std::string &S,
+ const std::string &PriorizeIfSubclassOf = "");
void EmitClangBasicReader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangBasicWriter(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangTypeNodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
|
Yeah, I also saw that in O3 most of the compile time is overshadowed by LLVM, so it makes perfect sense that the -O0 are the ones showing better results! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for your PR, this does look like a nice performance improvement, I added some reviewers who might have more detailed comments.
Please add a detailed summary for this PR. The summary is what usually goes into the git log and it is important that the git log is useful for quick understanding of changes without having to dig into the details.
Copied description from the issue that tracks this PR (sorry, I'm away from my PC) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Core idea of removing DECL_CONTEXT_BASE
and range checks for DeclContext
nodes seems correct to me, as all this machinery wasn't intended for DeclContext
in the first place. This is apparent from TableGen definition, which lacks any base parameter:
class DeclContext {} |
But changes made to ClangASTNodesEmitter.cpp
are not as clear to me. I'd expect it to simply opt out DeclContext
nodes from some code generation, but instead I see a lot of changes concerned with PrioritizedClasses
. It would be nice if you explain what's going on in that file.
The reason for puting the classes with DeclContext closer together is to achieve a better code generation. Here's an example of the asssembly with just the removal of the DECL_CONTEXT_BASE macro: clang::Decl::castToDeclContext(clang::Decl const*): # @clang::Decl::castToDeclContext(clang::Decl const*)
.L_ZN5clang4Decl17castToDeclContextEPKS0_$local:
movl 28(%rdi), %edx
leaq .LJTI66_0(%rip), %rsi
movq %rdi, %rax
movl $40, %ecx
andl $127, %edx
decl %edx
movslq (%rsi,%rdx,4), %rdx
addq %rsi, %rdx
jmpq *%rdx
.LBB66_5:
movl $48, %ecx
addq %rcx, %rax
retq
.LBB66_1:
addq %rcx, %rax
retq
.LBB66_2:
movl $72, %ecx
addq %rcx, %rax
retq
.LBB66_6:
movl $64, %ecx
addq %rcx, %rax
retq
.LBB66_7:
movl $56, %ecx
addq %rcx, %rax
retq
.LBB66_8:
.LJTI66_0:
<jump table> While this is better, it is still not performing ideally compared to just the load and an add of the final implementation. I'm of course open to suggestions if there is a better way of getting llvm to emit the right thing. I agree with @cor3ntin that a comment explaining the need for this order priorization is needed. |
Addressed the first review comments! Thank you for your time, I appreciate it :) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, but you should wait for more approvals before merging.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
1% is a very consequential improvement, congrats!
Thank you! I truly appreciate the congrats! :D |
@Destroyerrrocket We'll merge as soon as you deem it ready and provide commit description. PR description doesn't seem a good fit for a commit description to me. |
5cade3d
to
bea08b9
Compare
Squashed, rebased, and updated the commit message to give more information. You can go ahead and merge! |
castToDeclContext is a heavily used function, and as such, it needs to be kept as slim as feasible to preserve as much performance as possible. To this end, it was observed that the function was generating suboptimal assembly code, and putting the most common execution path in the longest sequence of instructions. This patch addresses this by guiding the compiler towards generating a lookup table of offsets, which can be used to perform an addition on the pointer. This results in a 1-2% improvement on debug builds (and a negligible improvement on release). To achieve this, the switch was simplified to flatten the if statements in the default branch. In order to make the values of the switch more compact, encouraging LLVM to generate a look-up table instead of a jump table, the AST TableGen generator was modified so it can take order priority based on class inheritance. This combination allowed for a more optimal generation of the function. Of note, 2 other functions with an equivalent structure also needed to be modified. Please refer to the issue/PR for exact assembly results: llvm#76824 llvm#76825 Reviewers: Cor3ntin & Endilll
bea08b9
to
2afecb3
Compare
The changes to tablegen made by #76825 result in `StmtClass::lastStmtConstant` changing from `StmtClass::WhileStmtClass` to `StmtClass::GCCAsmStmtClass`. Since `CFG::BuildOptions::alwaysAdd` is never called with a `WhileStmt`, this has flown under the radar until now. Once such test in which an out-of-bounds access occurs is `test/Sema/inline-asm-validate.c`, among many others.
The changes to tablegen made by llvm#76825 result in `StmtClass::lastStmtConstant` changing from `StmtClass::WhileStmtClass` to `StmtClass::GCCAsmStmtClass`. Since `CFG::BuildOptions::alwaysAdd` is never called with a `WhileStmt`, this has flown under the radar until now. Once such test in which an out-of-bounds access occurs is `test/Sema/inline-asm-validate.c`, among many others.
The changes to tablegen made by llvm#76825 result in `StmtClass::lastStmtConstant` changing from `StmtClass::WhileStmtClass` to `StmtClass::GCCAsmStmtClass`. Since `CFG::BuildOptions::alwaysAdd` is never called with a `WhileStmt`, this has flown under the radar until now. Once such test in which an out-of-bounds access occurs is `test/Sema/inline-asm-validate.c`, among many others.
[copied from issue]
The function castToDeclContext takes around 2% of the execution time in my test run.
I was profiling clang to see if by any chance I could spot any small mistakes that were taking a significant amount of time on debug builds, and while pretty much everything is either complicated enough where optimizing would be impossible for me or is already very performant, I noticed that valgrind was reporting an interesting function as the top of the 'auto' category:
It is being executed 1.8 billion times, and the implementation looks pretty trivial to me:
Looking at the assembly, I expected pretty much a lookup table and an add operation, but it is pretty clear that the resulting code is not ideal:
The PR I'll submit in a few minutes fixes this problem by eliminating the need for the macro DECL_CONTEXT_BASE (it's only used here and in two other analogous functions), and reordering the AST decl order to prioritize classes that inherit from DeclContext. I also experimented with hand rolled offset tables, but this is far from maintainable even if it manages to compress 3 lookup tables into one. The resulting assembly is just:
And the build difference of clang+clang-tools-extra with a debug build:
NonOpt: ninja 19007,02s user 760,01s system 2284% cpu 14:25,23 total
Opt: ninja 18806,18s user 763,33s system 2308% cpu 14:07,74 total
So around ~1.02 speedup, ~0.98 of the previous execution, nothing earth shattering, but what would be expected from valgrind, and I already did all the legwork, so I might as well send it :)