From cbce2f02e9df04a1f7f292c4a2b8b50321f14fae Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 23 Jan 2018 23:05:04 +0000 Subject: [PATCH] [TableGen] Optimize the regex search. llvm::Regex is still the slowest regex engine on earth, running it over all instructions on X86 takes a while. Extract a prefix and use a binary search to reduce the search space before we resort to regex matching. There are a couple of caveats here: - The generic opcodes are outside of the sorted enum. They're handled in an extra loop. - If there's a top-level bar we can't use the prefix trick. - We bail on top-level ?. This could be handled, but it's rare. This brings the time to generate X86GenInstrInfo.inc from 21s to 4.7s on my machine. llvm-svn: 323277 --- llvm/utils/TableGen/CodeGenSchedule.cpp | 96 +++++++++++++++++++------ 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp index b753e19a54435..93409104313ec 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -12,17 +12,18 @@ // //===----------------------------------------------------------------------===// -#include "CodeGenInstruction.h" #include "CodeGenSchedule.h" +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include #include @@ -50,36 +51,91 @@ struct InstrsOp : public SetTheory::Operator { }; // (instregex "OpcPat",...) Find all instructions matching an opcode pattern. -// -// TODO: Since this is a prefix match, perform a binary search over the -// instruction names using lower_bound. Note that the predefined instrs must be -// scanned linearly first. However, this is only safe if the regex pattern has -// no top-level bars. The DAG already has a list of patterns, so there's no -// reason to use top-level bars, but we need a way to verify they don't exist -// before implementing the optimization. struct InstRegexOp : public SetTheory::Operator { const CodeGenTarget &Target; InstRegexOp(const CodeGenTarget &t): Target(t) {} + /// Remove any text inside of parentheses from S. + static std::string removeParens(llvm::StringRef S) { + std::string Result; + unsigned Paren = 0; + // NB: We don't care about escaped parens here. + for (char C : S) { + switch (C) { + case '(': + ++Paren; + break; + case ')': + --Paren; + break; + default: + if (Paren == 0) + Result += C; + } + } + return Result; + } + void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, ArrayRef Loc) override { - SmallVector RegexList; + SmallVector>, 4> RegexList; for (Init *Arg : make_range(Expr->arg_begin(), Expr->arg_end())) { StringInit *SI = dyn_cast(Arg); if (!SI) - PrintFatalError(Loc, "instregex requires pattern string: " - + Expr->getAsString()); - std::string pat = SI->getValue(); - // Implement a python-style prefix match. + PrintFatalError(Loc, "instregex requires pattern string: " + + Expr->getAsString()); + // Extract a prefix that we can binary search on. + static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; + auto FirstMeta = SI->getValue().find_first_of(RegexMetachars); + // Look for top-level | or ?. We cannot optimize them to binary search. + if (removeParens(SI->getValue()).find_first_of("|?") != std::string::npos) + FirstMeta = 0; + StringRef Prefix = SI->getValue().substr(0, FirstMeta); + std::string pat = SI->getValue().substr(FirstMeta); + if (pat.empty()) { + RegexList.push_back(std::make_pair(Prefix, None)); + continue; + } + // For the rest use a python-style prefix match. if (pat[0] != '^') { pat.insert(0, "^("); pat.insert(pat.end(), ')'); } - RegexList.push_back(Regex(pat)); - } - for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) { - for (auto &R : RegexList) { - if (R.match(Inst->TheDef->getName())) + RegexList.push_back(std::make_pair(Prefix, Regex(pat))); + } + for (auto &R : RegexList) { + // The generic opcodes are unsorted, handle them manually. + for (auto *Inst : Target.getInstructionsByEnumValue().slice( + 0, TargetOpcode::GENERIC_OP_END + 1)) { + if (Inst->TheDef->getName().startswith(R.first) && + (!R.second || + R.second->match(Inst->TheDef->getName().substr(R.first.size())))) + Elts.insert(Inst->TheDef); + } + + ArrayRef Instructions = + Target.getInstructionsByEnumValue().slice( + TargetOpcode::GENERIC_OP_END + 1); + + // Target instructions are sorted. Find the range that starts with our + // prefix. + struct Comp { + bool operator()(const CodeGenInstruction *LHS, StringRef RHS) { + return LHS->TheDef->getName() < RHS; + } + bool operator()(StringRef LHS, const CodeGenInstruction *RHS) { + return LHS < RHS->TheDef->getName() && + !RHS->TheDef->getName().startswith(LHS); + } + }; + auto Range = std::equal_range(Instructions.begin(), Instructions.end(), + R.first, Comp()); + + // For this range we know that it starts with the prefix. Check if there's + // a regex that needs to be checked. + for (auto *Inst : make_range(Range)) { + if (!R.second || + R.second->match(Inst->TheDef->getName().substr(R.first.size()))) Elts.insert(Inst->TheDef); } }