Skip to content

Commit

Permalink
Support escaping in TrigramIndex.
Browse files Browse the repository at this point in the history
Summary:
This is a follow-up to r288303, which introduced TrigramIndex
to speed up SpecialCaseList in cases where all rules are
simple wildcards, like *hello*wor.d*.

This change adds support for backslash escaping, so that it is
possible to specify rules like *c\+\+abi*.

Reviewers: pcc

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D27318

llvm-svn: 288553
  • Loading branch information
Ivan Krasin committed Dec 2, 2016
1 parent cb3ef15 commit 75453b0
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 14 deletions.
37 changes: 25 additions & 12 deletions llvm/lib/Support/TrigramIndex.cpp
Expand Up @@ -26,28 +26,41 @@ using namespace llvm;

static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";

static bool isSimpleWildcard(StringRef Str) {
// Check for regex metacharacters other than '*' and '.'.
return Str.find_first_of(RegexAdvancedMetachars) == StringRef::npos;
/// Returns true when \p Char occurs in RegexAdvancedMetachars, i.e. it is one
/// of the regex metacharacters other than '*' and '.'.
///
/// Note: strchr treats the terminating NUL as part of the searched string, so
/// a \p Char of 0 also yields true.
static bool isAdvancedMetachar(unsigned Char) {
  const char *Match = strchr(RegexAdvancedMetachars, Char);
  return Match != nullptr;
}

void TrigramIndex::insert(std::string Regex) {
if (Defeated) return;
if (!isSimpleWildcard(Regex)) {
Defeated = true;
return;
}

std::set<unsigned> Was;
unsigned Cnt = 0;
unsigned Tri = 0;
unsigned Len = 0;
bool Escaped = false;
for (unsigned Char : Regex) {
if (Char == '.' || Char == '*') {
Tri = 0;
Len = 0;
continue;
if (!Escaped) {
// Regular expressions allow escaping symbols by preceding it with '\'.
if (Char == '\\') {
Escaped = true;
continue;
}
if (isAdvancedMetachar(Char)) {
// This is a more complicated regex than we can handle here.
Defeated = true;
return;
}
if (Char == '.' || Char == '*') {
Tri = 0;
Len = 0;
continue;
}
}
if (Escaped && Char >= '1' && Char <= '9') {
Defeated = true;
return;
}
// We have already handled escaping and can reset the flag.
Escaped = false;
Tri = ((Tri << 8) + Char) & 0xFFFFFF;
Len++;
if (Len < 3)
Expand Down
11 changes: 11 additions & 0 deletions llvm/unittests/Support/SpecialCaseListTest.cpp
Expand Up @@ -178,4 +178,15 @@ TEST_F(SpecialCaseListTest, PopularTrigram) {
EXPECT_TRUE(SCL->inSection("fun", "aaaabbbaaa"));
}

// Builds a special case list with two "src" rules whose patterns contain
// backslash escapes (an escaped '+' and an escaped backslash), then checks
// which queries inSection() accepts.
TEST_F(SpecialCaseListTest, EscapedSymbols) {
std::unique_ptr<SpecialCaseList> SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n"
"src:*hello\\\\world*\n");
// The escaped '+' is expected to match a literal '+'; queries that spell out
// the backslashes themselves are expected not to match.
EXPECT_TRUE(SCL->inSection("src", "dir/c++abi"));
EXPECT_FALSE(SCL->inSection("src", "dir/c\\+\\+abi"));
EXPECT_FALSE(SCL->inSection("src", "c\\+\\+abi"));
// The escaped backslash is expected to match exactly one literal backslash,
// so a query with two consecutive backslashes is expected not to match.
EXPECT_TRUE(SCL->inSection("src", "C:\\hello\\world"));
EXPECT_TRUE(SCL->inSection("src", "hello\\world"));
EXPECT_FALSE(SCL->inSection("src", "hello\\\\world"));
}

}
24 changes: 22 additions & 2 deletions llvm/unittests/Support/TrigramIndexTest.cpp
Expand Up @@ -94,9 +94,29 @@ TEST_F(TrigramIndexTest, TooComplicatedRegex2) {
EXPECT_TRUE(TI->isDefeated());
}

TEST_F(TrigramIndexTest, SpecialSymbol) {
TEST_F(TrigramIndexTest, EscapedSymbols) {
std::unique_ptr<TrigramIndex> TI =
makeTrigramIndex({"*c\\+\\+*"});
makeTrigramIndex({"*c\\+\\+*", "*hello\\\\world*", "a\\tb", "a\\0b"});
EXPECT_FALSE(TI->isDefeated());
EXPECT_FALSE(TI->isDefinitelyOut("c++"));
EXPECT_TRUE(TI->isDefinitelyOut("c\\+\\+"));
EXPECT_FALSE(TI->isDefinitelyOut("hello\\world"));
EXPECT_TRUE(TI->isDefinitelyOut("hello\\\\world"));
EXPECT_FALSE(TI->isDefinitelyOut("atb"));
EXPECT_TRUE(TI->isDefinitelyOut("a\\tb"));
EXPECT_TRUE(TI->isDefinitelyOut("a\tb"));
EXPECT_FALSE(TI->isDefinitelyOut("a0b"));
}

// A rule containing a backslash followed by the digit '1' is expected to
// leave the index defeated (TrigramIndex::insert gives up on an escaped
// digit in the range 1-9).
TEST_F(TrigramIndexTest, Backreference1) {
std::unique_ptr<TrigramIndex> TI =
makeTrigramIndex({"*foo\\1*"});
EXPECT_TRUE(TI->isDefeated());
}

// Same expectation as for an escaped '1', with the digit '2': a rule
// containing a backslash followed by '2' is expected to leave the index
// defeated.
TEST_F(TrigramIndexTest, Backreference2) {
std::unique_ptr<TrigramIndex> TI =
makeTrigramIndex({"*foo\\2*"});
EXPECT_TRUE(TI->isDefeated());
}

Expand Down

0 comments on commit 75453b0

Please sign in to comment.