Skip to content

Commit

Permalink
[TableGen] Optimize the regex search.
Browse files Browse the repository at this point in the history
llvm::Regex is still the slowest regex engine on earth, running it over
all instructions on X86 takes a while. Extract a prefix and use a binary
search to reduce the search space before we resort to regex matching.

There are a couple of caveats here:
- The generic opcodes are outside of the sorted enum. They're handled in an extra loop.
- If there's a top-level bar we can't use the prefix trick.
- We bail on top-level ?. This could be handled, but it's rare.

This brings the time to generate X86GenInstrInfo.inc from 21s to 4.7s on
my machine.

llvm-svn: 323277
  • Loading branch information
d0k committed Jan 23, 2018
1 parent 0627a33 commit cbce2f0
Showing 1 changed file with 76 additions and 20 deletions.
96 changes: 76 additions & 20 deletions llvm/utils/TableGen/CodeGenSchedule.cpp
Expand Up @@ -12,17 +12,18 @@
//
//===----------------------------------------------------------------------===//

#include "CodeGenInstruction.h"
#include "CodeGenSchedule.h"
#include "CodeGenInstruction.h"
#include "CodeGenTarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
#include <algorithm>
#include <iterator>
Expand Down Expand Up @@ -50,36 +51,91 @@ struct InstrsOp : public SetTheory::Operator {
};

// (instregex "OpcPat",...) Find all instructions matching an opcode pattern.
//
// TODO: Since this is a prefix match, perform a binary search over the
// instruction names using lower_bound. Note that the predefined instrs must be
// scanned linearly first. However, this is only safe if the regex pattern has
// no top-level bars. The DAG already has a list of patterns, so there's no
// reason to use top-level bars, but we need a way to verify they don't exist
// before implementing the optimization.
struct InstRegexOp : public SetTheory::Operator {
const CodeGenTarget &Target;
InstRegexOp(const CodeGenTarget &t): Target(t) {}

/// Remove any text inside of parentheses from S.
static std::string removeParens(llvm::StringRef S) {
std::string Result;
unsigned Paren = 0;
// NB: We don't care about escaped parens here.
for (char C : S) {
switch (C) {
case '(':
++Paren;
break;
case ')':
--Paren;
break;
default:
if (Paren == 0)
Result += C;
}
}
return Result;
}

void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
ArrayRef<SMLoc> Loc) override {
SmallVector<Regex, 4> RegexList;
SmallVector<std::pair<StringRef, Optional<Regex>>, 4> RegexList;
for (Init *Arg : make_range(Expr->arg_begin(), Expr->arg_end())) {
StringInit *SI = dyn_cast<StringInit>(Arg);
if (!SI)
PrintFatalError(Loc, "instregex requires pattern string: "
+ Expr->getAsString());
std::string pat = SI->getValue();
// Implement a python-style prefix match.
PrintFatalError(Loc, "instregex requires pattern string: " +
Expr->getAsString());
// Extract a prefix that we can binary search on.
static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
auto FirstMeta = SI->getValue().find_first_of(RegexMetachars);
// Look for top-level | or ?. We cannot optimize them to binary search.
if (removeParens(SI->getValue()).find_first_of("|?") != std::string::npos)
FirstMeta = 0;
StringRef Prefix = SI->getValue().substr(0, FirstMeta);
std::string pat = SI->getValue().substr(FirstMeta);
if (pat.empty()) {
RegexList.push_back(std::make_pair(Prefix, None));
continue;
}
// For the rest use a python-style prefix match.
if (pat[0] != '^') {
pat.insert(0, "^(");
pat.insert(pat.end(), ')');
}
RegexList.push_back(Regex(pat));
}
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
for (auto &R : RegexList) {
if (R.match(Inst->TheDef->getName()))
RegexList.push_back(std::make_pair(Prefix, Regex(pat)));
}
for (auto &R : RegexList) {
// The generic opcodes are unsorted, handle them manually.
for (auto *Inst : Target.getInstructionsByEnumValue().slice(
0, TargetOpcode::GENERIC_OP_END + 1)) {
if (Inst->TheDef->getName().startswith(R.first) &&
(!R.second ||
R.second->match(Inst->TheDef->getName().substr(R.first.size()))))
Elts.insert(Inst->TheDef);
}

ArrayRef<const CodeGenInstruction *> Instructions =
Target.getInstructionsByEnumValue().slice(
TargetOpcode::GENERIC_OP_END + 1);

// Target instructions are sorted. Find the range that starts with our
// prefix.
struct Comp {
bool operator()(const CodeGenInstruction *LHS, StringRef RHS) {
return LHS->TheDef->getName() < RHS;
}
bool operator()(StringRef LHS, const CodeGenInstruction *RHS) {
return LHS < RHS->TheDef->getName() &&
!RHS->TheDef->getName().startswith(LHS);
}
};
auto Range = std::equal_range(Instructions.begin(), Instructions.end(),
R.first, Comp());

// For this range we know that it starts with the prefix. Check if there's
// a regex that needs to be checked.
for (auto *Inst : make_range(Range)) {
if (!R.second ||
R.second->match(Inst->TheDef->getName().substr(R.first.size())))
Elts.insert(Inst->TheDef);
}
}
Expand Down

0 comments on commit cbce2f0

Please sign in to comment.