Skip to content

Commit

Permalink
re2: do not edit Regexp during IsAnchorStart, IsAnchorEnd
Browse files Browse the repository at this point in the history
Must never edit existing regexps: they are logically immutable.
Was causing RE2::Regexp to give back the wrong answer.

R=r
CC=re2-dev
http://codereview.appspot.com/4626065
  • Loading branch information
rsc committed Jun 22, 2011
1 parent 5541bb1 commit 9815f8b
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 48 deletions.
122 changes: 80 additions & 42 deletions re2/compile.cc
Expand Up @@ -854,59 +854,97 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
// Is this regexp required to start at the beginning of the text?
// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
// but handles (\A(a|b)). Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre) {
for (;;) {
Regexp* re = *pre;
if (re == NULL)
return false;
switch (re->op()) {
default:
break;
case kRegexpConcat:
if (re->nsub() > 0) {
pre = &re->sub()[0];
continue;
static bool IsAnchorStart(Regexp** pre, int depth) {
Regexp* re = *pre;
Regexp* sub;
// The depth limit makes sure that we don't overflow
// the stack on a deeply nested regexp. As the comment
// above says, IsAnchorStart is conservative, so returning
// a false negative is okay. The exact limit is somewhat arbitrary.
if (re == NULL || depth >= 4)
return false;
switch (re->op()) {
default:
break;
case kRegexpConcat:
if (re->nsub() > 0) {
sub = re->sub()[0]->Incref();
if (IsAnchorStart(&sub, depth+1)) {
Regexp** subcopy = new Regexp*[re->nsub()];
subcopy[0] = sub; // already have reference
for (int i = 1; i < re->nsub(); i++)
subcopy[i] = re->sub()[i]->Incref();
*pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
delete[] subcopy;
re->Decref();
return true;
}
break;
case kRegexpCapture:
pre = &re->sub()[0];
continue;
case kRegexpBeginText:
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
sub->Decref();
}
break;
case kRegexpCapture:
sub = re->sub()[0]->Incref();
if (IsAnchorStart(&sub, depth+1)) {
*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
re->Decref();
return true;
}
return false;
}
sub->Decref();
break;
case kRegexpBeginText:
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
re->Decref();
return true;
}
return false;
}

// Is this regexp required to start at the end of the text?
// Only approximate; can return false for complicated regexps like (a\z|b\z),
// but handles ((a|b)\z). Could use the Walker to write a more exact one.
static bool IsAnchorEnd(Regexp** pre) {
for (;;) {
Regexp* re = *pre;
if (re == NULL)
return false;
switch (re->op()) {
default:
break;
case kRegexpConcat:
if (re->nsub() > 0) {
pre = &re->sub()[re->nsub() - 1];
continue;
static bool IsAnchorEnd(Regexp** pre, int depth) {
Regexp* re = *pre;
Regexp* sub;
// The depth limit makes sure that we don't overflow
// the stack on a deeply nested regexp. As the comment
// above says, IsAnchorEnd is conservative, so returning
// a false negative is okay. The exact limit is somewhat arbitrary.
if (re == NULL || depth >= 4)
return false;
switch (re->op()) {
default:
break;
case kRegexpConcat:
if (re->nsub() > 0) {
sub = re->sub()[re->nsub() - 1]->Incref();
if (IsAnchorEnd(&sub, depth+1)) {
Regexp** subcopy = new Regexp*[re->nsub()];
subcopy[re->nsub() - 1] = sub; // already have reference
for (int i = 0; i < re->nsub() - 1; i++)
subcopy[i] = re->sub()[i]->Incref();
*pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
delete[] subcopy;
re->Decref();
return true;
}
break;
case kRegexpCapture:
pre = &re->sub()[0];
continue;
case kRegexpEndText:
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
sub->Decref();
}
break;
case kRegexpCapture:
sub = re->sub()[0]->Incref();
if (IsAnchorEnd(&sub, depth+1)) {
*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
re->Decref();
return true;
}
return false;
}
sub->Decref();
break;
case kRegexpEndText:
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
re->Decref();
return true;
}
return false;
}

void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
Expand Down Expand Up @@ -963,8 +1001,8 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {

// Record whether prog is anchored, removing the anchors.
// (They get in the way of other optimizations.)
bool is_anchor_start = IsAnchorStart(&sre);
bool is_anchor_end = IsAnchorEnd(&sre);
bool is_anchor_start = IsAnchorStart(&sre, 0);
bool is_anchor_end = IsAnchorEnd(&sre, 0);

// Generate fragment for entire regexp.
Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_);
Expand Down
16 changes: 10 additions & 6 deletions re2/testing/re2_test.cc
Expand Up @@ -505,7 +505,6 @@ TEST(Capture, NamedGroups) {
}

TEST(RE2, FullMatchWithNoArgs) {
/***** FullMatch with no args *****/
CHECK(RE2::FullMatch("h", "h"));
CHECK(RE2::FullMatch("hello", "hello"));
CHECK(RE2::FullMatch("hello", "h.*o"));
Expand All @@ -514,8 +513,6 @@ TEST(RE2, FullMatchWithNoArgs) {
}

TEST(RE2, PartialMatch) {
/***** PartialMatch *****/

CHECK(RE2::PartialMatch("x", "x"));
CHECK(RE2::PartialMatch("hello", "h.*o"));
CHECK(RE2::PartialMatch("othello", "h.*o"));
Expand Down Expand Up @@ -1135,11 +1132,11 @@ TEST(RE2, DeepRecursion) {
// segmentation violation due to stack overflow before pcre was
// patched.
// Again, a PCRE legacy test. RE2 doesn't recurse.
string comment("/*");
string comment("x*");
string a(131072, 'a');
comment += a;
comment += "*/";
RE2 re("((?:\\s|//.*\n|/[*](?:\n|.)*?[*]/)*)");
comment += "*x";
RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
CHECK(RE2::FullMatch(comment, re));
}

Expand Down Expand Up @@ -1355,4 +1352,11 @@ TEST(RE2, CapturingGroupNames) {
EXPECT_EQ(want, have);
}

TEST(RE2, RegexpToStringLossOfAnchor) {
EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at");
EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at");
EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$");
EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)");
}

} // namespace re2

0 comments on commit 9815f8b

Please sign in to comment.