Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

[STD] various changes in preparation for inlining p5 regex optimizations

git-svn-id: http://svn.pugscode.org/pugs@21901 c213334d-75ef-0310-aa23-eaa082d1ae64
  • Loading branch information...
commit c6c21e5fc270714a63f7021b9dcba85f6cb92871 1 parent 93d41de
lwall authored
Showing with 255 additions and 116 deletions.
  1. +1 −1  Cursor.pmc
  2. +40 −42 STD.pm
  3. +212 −72 gimme5
  4. +1 −0  mangle.pl
  5. +1 −1  tryfoo
View
2  Cursor.pmc
@@ -1439,7 +1439,7 @@ sub _PATTERN { my $self = shift;
my $P = $self->{_pos} // 0;
my $buf = $self->{_orig};
pos($$buf) = $P;
- if ($$buf =~ $qr) {
+ if ($$buf =~ /$qr/gc) {
my $len = $+[0] - $P;
$self->deb("PATTERN $qr matched @{[substr($$buf,$P,$len)]} at $P $len") if $DEBUG & DEBUG::matchers;
my $r = $self->cursor($P+$len);
View
82 STD.pm
@@ -9,7 +9,7 @@ my $PARSER is context<rw>;
my $IN_DECL is context<rw>;
# random rule for debugging, please ignore
-regex foo {
+token foo {
'foo' 'bar' 'baz'
}
@@ -153,6 +153,7 @@ constant %list_assignment = (:prec<i=>, :sub<e=>, :assoc<right>);
constant %list_prefix = (:prec<e=>);
constant %loose_and = (:prec<d=>, :assoc<left>, :assign);
constant %loose_or = (:prec<c=>, :assoc<left>, :assign);
+constant %feed_infix = (:prec<b=>, :assoc<left>);
constant %LOOSEST = (:prec<a=!>);
constant %terminator = (:prec<a=>, :assoc<list>);
@@ -256,6 +257,9 @@ class Loose_and does PrecOp {
class Loose_or does PrecOp {
our %o = %loose_or;
} # end class
+class Feed_infix does PrecOp {
+ our %o = %feed_infix;
+} # end class
class Terminator does PrecOp {
our %o = %terminator;
} # end class
@@ -418,7 +422,7 @@ token unsp {
token vws {
\v ::
- { $COMPILING::LINE++ } # XXX wrong several ways
+ { $COMPILING::LINE++ } # XXX wrong several ways, use self.lineof($¢.pos)
[ '#DEBUG -1' { say "DEBUG"; $STD::DEBUG = $*DEBUG = -1; } ]?
}
@@ -440,12 +444,16 @@ token unv {
]
}
-token identish {
+token identifier {
<.alpha> \w*
}
+token apostrophe {
+ <[ ' \- ]>
+}
+
token ident {
- <.identish> [<[ ' \- ]><identish>]*
+ <.identifier> [ <.apostrophe> <.identifier> ]*
}
# XXX We need to parse the pod eventually to support $= variables.
@@ -453,10 +461,10 @@ token ident {
token pod_comment {
^^ '=' <.unsp>?
[
- | 'begin' \h+ <ident> :: .*? \n
- '=' <.unsp>? 'end' \h+ $<ident> » \N* {*} #= tagged
- | 'begin' » :: \h* \n .*? \n
- '=' <.unsp>? 'end' » \N* {*} #= anon
+ | 'begin' \h+ <ident> :: .*?
+ "\n=" <.unsp>? 'end' \h+ $<ident> » \N* {*} #= tagged
+ | 'begin' » :: \h* \n .*?
+ "\n=" <.unsp>? 'end' » \N* {*} #= anon
| ::
[ <?before .*? ^^ '=cut' » > <.panic: "Obsolete pod format, please use =begin/=end instead"> ]?
\N* {*} #= misc
@@ -539,6 +547,7 @@ token regex_block {
rule statementlist {
:my $PARSER is context<rw> = self;
[
+ | $
| <?before <[\)\]\}]> >
| [<statement><.eat_terminator> ]*
]
@@ -569,7 +578,11 @@ token label {
token statement {
:my $endargs is context = -1;
<!before <[\)\]\}]> >
+
+ # this could either be a statement that follows a declaration
+ # or a statement that is within the block of a code declaration
<!!{ bless $¢, ref $PARSER; }>
+
[
| <label> <statement> {*} #= label
| <statement_control> {*} #= control
@@ -2994,7 +3007,7 @@ token infix:sym<::=> ( --> Item_assignment)
token infix:sym<.=> ( --> Item_assignment) {
<sym> <.ws>
- [ <?before \w+';' | < new sort subst trans > > || <worryobs('.= as append operator', '~=')> ]
+ [ <?before \w+';' | 'new' | 'sort' | 'subst' | 'trans' > || <worryobs('.= as append operator', '~=')> ]
{ $<O><nextterm> = 'dottyop' }
}
@@ -3096,27 +3109,6 @@ token term:name ( --> Term)
# unrecognized names are assumed to be post-declared listops.
|| <args>?
-# || <?before \s> <arglist>
-# {*} #= listop args
-# ||
-# [
-# | '.(' <in: ')', 'semilist', 'argument list'>
-# {*} #= func args
-#
-# | '(' <in: ')', 'semilist', 'argument list'>
-# {*} #= func args
-#
-# | <.unsp> '.'? '(' <in: ')', 'semilist', 'argument list'>
-# {*} #= func args
-#
-#
-# | :: {*} #= listop noarg
-# ]
-#
-# [
-# || ':' <?before \s> <arglist> # either switch to listopiness
-# || {{ $+prevop = $<O> = {}; }} # or allow adverbs
-# ]
]
}
@@ -3143,6 +3135,18 @@ token infix:sym<xor> ( --> Loose_or)
token infix:sym<orelse> ( --> Loose_or)
{ <sym> }
+token infix:sym« <== » ( --> Feed_infix)
+ { <sym> }
+
+token infix:sym« ==> » ( --> Feed_infix)
+ { <sym> {*} } #'
+
+token infix:sym« <<== » ( --> Feed_infix)
+ { <sym> }
+
+token infix:sym« ==>> » ( --> Feed_infix)
+ { <sym> {*} } #'
+
## expression terminator
token terminator:sym<;> ( --> Terminator)
@@ -3169,12 +3173,6 @@ token terminator:sym<given> ( --> Terminator)
token terminator:sym<when> ( --> Terminator)
{ <?before 'when' » > }
-token terminator:sym« <== » ( --> Terminator)
- { <?before '<==' > }
-
-token terminator:sym« ==> » ( --> Terminator)
- { <?before '==>' > {*} } #'
-
token terminator:sym« --> » ( --> Terminator)
{ <?before '-->' > {*} } #'
@@ -3724,13 +3722,13 @@ grammar Regex is STD {
# XXX will this please work somehow ???
token mod_internal:sym<:a( )> { $<sym>=[':a'|':ignoreaccent'] <mod_arg> { $+ignoreaccent = $<mod_arg>.eval } }
- token mod_internal:sym<:s> { <sym> 'igspace'? » { $+sigspace = 1 } }
- token mod_internal:sym<:!s> { <sym> 'igspace'? » { $+sigspace = 0 } }
- token mod_internal:sym<:s( )> { <sym> 'igspace'? <mod_arg> { $+sigspace = $<mod_arg>.eval } }
+ token mod_internal:sym<:s> { ':s' 'igspace'? » { $+sigspace = 1 } }
+ token mod_internal:sym<:!s> { ':s' 'igspace'? » { $+sigspace = 0 } }
+ token mod_internal:sym<:s( )> { ':s' 'igspace'? <mod_arg> { $+sigspace = $<mod_arg>.eval } }
- token mod_internal:sym<:r> { <sym> 'atchet'? » { $+ratchet = 1 } }
- token mod_internal:sym<:!r> { <sym> 'atchet'? » { $+ratchet = 0 } }
- token mod_internal:sym<:r( )> { <sym> 'atchet'? » <mod_arg> { $+ratchet = $<mod_arg>.eval } }
+ token mod_internal:sym<:r> { ':r' 'atchet'? » { $+ratchet = 1 } }
+ token mod_internal:sym<:!r> { ':r' 'atchet'? » { $+ratchet = 0 } }
+ token mod_internal:sym<:r( )> { ':r' 'atchet'? » <mod_arg> { $+ratchet = $<mod_arg>.eval } }
token mod_internal:adv {
<?before ':' <ident> > [ :lang($¢.cursor_fresh($+LANG)) <quotepair> ] { $/<sym> := «: $<quotepair><key>» }
View
284 gimme5
@@ -4,6 +4,11 @@ use 5.010;
use strict;
use warnings;
use Text::Balanced qw(extract_bracketed);
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+use Encode;
+use utf8;
my $failover = 0;
if (@ARGV) {
@@ -200,8 +205,13 @@ sub un6 {
}
{
- local $/;
- $_ = <>;
+ open(IN, $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
+ {
+ local $/;
+ binmode(IN, ':utf8');
+ $_ = <IN>;
+ }
+ close IN;
push @impure, m/^method (\w+)/mg;
@impure{@impure} = (1) x @impure;
#warn "@impure\n";
@@ -447,9 +457,9 @@ END
warn "ARGSTUFF: ", $argstuff if $argstuff =~ /\S/;
my $p = "";
- local $MAYBACKTRACK = 1; # XXX ratchet current broken
- if ($KIND eq 'regex') {
- $MAYBACKTRACK = 1;
+ local $MAYBACKTRACK = 1;
+ if ($KIND eq 'token' or $KIND eq 'rule') {
+ $MAYBACKTRACK = 0;
}
if ($args =~ s/ *--> *(\w*) *$//) {
@@ -879,6 +889,7 @@ our $DELIM;
our %INSTANTIATED;
require 'mangle.pl';
+use utf8;
END
@@ -888,7 +899,7 @@ END
sub dumpretree {
if (%$RETREE) {
$out .= "BEGIN {\n \$retree = YAML::XS::Load(<<'RETREE_END');\n";
- $out .= Dump($RETREE);
+ $out .= Encode::decode("utf8", Dump($RETREE));
$out .= "RETREE_END\n}\n";
}
}
@@ -904,12 +915,14 @@ sub here {
sub ws {
return if $KIND eq 'rule'; # meta whitespace parsed in atom
for (;;) {
- next if s/^\s+//;
- next if s/^#\(.*?\)//s;
- next if s/^#\{.*?\}//s;
- next if s/^#\[.*?\]//s;
- next if s/^#\<.*?\>//s;
- next if s/^#.*\n//;
+# next if s/^(?!=[\0-~])\s+//;
+ next if s/^[\x20\t\n\r]+//;
+ last unless s/^#//;
+ next if s/^\(.*?\)//s;
+ next if s/^\{.*?\}//s;
+ next if s/^\[.*?\]//s;
+ next if s/^\<.*?\>//s;
+ next if s/^.*\n//;
last;
}
}
@@ -1650,10 +1663,10 @@ sub unbalanced {
$text =~ s/\.\./-/g;
$text =~ s/^-\[/[^/;
if ($$self{i}) {
- $self->bind("\$C->_CCLASS$REV(qr/^(?i)$text\$/)");
+ $self->bind("\$C->_PATTERN$REV(qr/\\G(?i:$text)/)");
}
else {
- $self->bind("\$C->_CCLASS$REV(qr/^$text\$/)");
+ $self->bind("\$C->_PATTERN$REV(qr/\\G$text/)");
}
}
}
@@ -1672,11 +1685,12 @@ sub unbalanced {
sub walk {
my $self = shift;
my $text = $$self{text};
+ $text = "(?<=$text)" if $REV;
if ($$self{i}) {
- $self->bind('$C->_PATTERN' . $REV . '(qr/(?i)\\Q' . $text . '\\E/")');
+ $self->bind('$C->_PATTERN(qr/\\G(?i:' . $text . ')/")');
}
else {
- $self->bind('$C->_EXACT' . $REV . '("' . $text . '")');
+ $self->bind('$C->_PATTERN(qr/\\G' . $text . '/)');
}
}
}
@@ -1684,13 +1698,15 @@ sub unbalanced {
{ package RE_string; use base "REbase";
sub walk {
my $self = shift;
- my $text = $$self{text};
- $text =~ s/(['\\])/\\$1/g;
+ my $text = quotemeta($$self{text});
+ $text = "(?<=$text)" if $REV;
if ($$self{i}) {
- '$C->_PATTERN' . $REV . '(qr/(?i)\\Q' . $text . '\\E/")';
+ '$C->_PATTERN(qr/\\G(?i:' . $text . ')/)';
}
else {
- "\$C->_EXACT$REV('" . $text . "')";
+ "\$C->_PATTERN(qr/\\G$text/)";
+# my $l = length($text);
+# "(substr(\$\$buf, \$C->{_pos}, $l) eq '" . $text . "' ? \$C->cursor(\$C->{_pos} + $l) : ())"
}
}
}
@@ -1706,25 +1722,47 @@ sub unbalanced {
$not = 1;
}
if ($text eq '.') {
- $code = "\$C->_ANY$REV()";
+ if ($REV) {
+ $code = "\$C->_PATTERN(qr/\\G(?<=(?s:.))/)";
+ }
+ else {
+ $code = "\$C->_PATTERN(qr/\\G(?s:.)/)";
+ }
+# $code = "\$C->_ANY$REV()";
}
elsif ($text eq '.*') {
+# if ($REV) {
+# $code = "\$C->_PATTERN(qr/\\G(?<=.*)/)";
+# }
+# else {
+# $code = "\$C->_PATTERN(qr/\\G.*/)";
+# }
$code = "\$C->_SCANg$REV()";
}
elsif ($text eq '.*?') {
+# if ($REV) {
+# $code = "\$C->_PATTERN(qr/\\G(?<=.*?)/)";
+# }
+# else {
+# $code = "\$C->_PATTERN(qr/\\G.*?/)";
+# }
$code = "\$C->_SCANf$REV()";
}
elsif ($text eq '^') {
- $code = "\$C->_BOS$REV()";
+ $code = "\$C->_PATTERN(qr/\\G\\A/)";
+# $code = "\$C->_BOS$REV()";
}
elsif ($text eq '^^') {
- $code = "\$C->_BOL$REV()";
+ $code = "\$C->_PATTERN(qr/\\G(?m:^)/)";
+# $code = "\$C->_BOL$REV()";
}
elsif ($text eq '$') {
- $code = "\$C->_EOS$REV()";
+ $code = "\$C->_PATTERN(qr/\\G\\z/)";
+# $code = "\$C->_EOS$REV()";
}
elsif ($text eq '$$') {
- $code = "\$C->_EOL$REV()";
+ $code = "\$C->_PATTERN(qr/\\G(?m:\$)/)";
+# $code = "\$C->_EOL$REV()";
}
elsif ($text eq ':') {
$code = "\$C->_COMMITATOM$REV()";
@@ -1732,31 +1770,65 @@ sub unbalanced {
elsif ($text eq '::') {
$PURE = 0;
$code = "\$C->_COMMITBRANCH$REV()";
+ $MAYBACKTRACK = 1;
}
elsif ($text eq ':::') {
$PURE = 0;
$code = "\$C->_COMMITRULE$REV()";
+ $MAYBACKTRACK = 1;
}
elsif ($text eq '\\d') {
- $code = "\$C->_DIGIT$REV()";
+ if ($REV) {
+ $code = "\$C->_PATTERN(qr/\\G(?<=\\d)/)";
+ }
+ else {
+ $code = "\$C->_PATTERN(qr/\\G\\d/)";
+ }
+# $code = "\$C->_DIGIT$REV()";
}
elsif ($text eq '\\w') {
- $code = "\$C->_ALNUM$REV()";
+ if ($REV) {
+ $code = "\$C->_PATTERN(qr/\\G(?<=\\w)/)";
+ }
+ else {
+ $code = "\$C->_PATTERN(qr/\\G\\w/)";
+ }
+# $code = "\$C->_ALNUM$REV()";
}
elsif ($text eq '\\s') {
- $code = "\$C->_SPACE$REV()";
+ if ($REV) {
+ $code = "\$C->_PATTERN(qr/\\G(?<=\\s)/)";
+ }
+ else {
+ $code = "\$C->_PATTERN(qr/\\G\\s/)";
+ }
+# $code = "\$C->_SPACE$REV()";
}
elsif ($text eq '\\h') {
- $code = "\$C->_HSPACE$REV()";
+ if ($REV) {
+ $code = "\$C->_PATTERN(qr/\\G(?<=[\\x20\\t\\r])/)";
+ }
+ else {
+ $code = "\$C->_PATTERN(qr/\\G[\\x20\\t\\r]/)";
+ }
+# $code = "\$C->_HSPACE$REV()";
}
elsif ($text eq '\\v') {
- $code = "\$C->_VSPACE$REV()";
+ if ($REV) {
+ $code = "\$C->_PATTERN(qr/\\G(?<=[\\n])/)";
+ }
+ else {
+ $code = "\$C->_PATTERN(qr/\\G[\\n]/)";
+ }
+# $code = "\$C->_VSPACE$REV()";
}
elsif ($text eq '»') {
- $code = "\$C->_RIGHTWB$REV()";
+ $code = "\$C->_PATTERN(qr/\\G\\b/)";
+# $code = "\$C->_RIGHTWB$REV()";
}
elsif ($text eq '«') {
- $code = "\$C->_LEFTWB$REV()";
+ $code = "\$C->_PATTERN(qr/\\G\\b/)";
+# $code = "\$C->_LEFTWB$REV()";
}
elsif ($text eq '>>') {
$code = "\$C->_RIGHTWB$REV()";
@@ -1796,8 +1868,23 @@ sub unbalanced {
if ($name eq "sym") {
$$self{sym} = $SYM;
$$self{endsym} = $ENDSYM if $ENDSYM;
+ if ($$self{i}) {
+ return "\$C->_PATTERN(qr/\\G(?i:" . quotemeta($SYM) . ")/)";
+ }
+ else {
+ return "\$C->_PATTERN(qr/\\G" . quotemeta($SYM) . "/)";
+ }
return $re = '$C->_SYM($sym, ' . ($$self{i}//0) . ')'; # could pass endsym too here...
}
+ elsif ($name eq "alpha") {
+ return "\$C->_PATTERN(qr/\\G[_[:alpha:]]/)";
+ }
+ elsif ($name eq "_ALNUM") {
+ return "\$C->_PATTERN(qr/\\G\\w/)";
+ }
+# elsif ($name eq "ws") {
+# return "\$C->_PATTERN(qr/\\G(?{ \$C = \$C->ws; pos(\$_) = \$C->{_pos} })/)";
+# }
elsif ($name eq "nextsame") {
$NEEDORIGARGS++;
$re = '$self->SUPER::' . $NAME . '(@origargs)';
@@ -2016,44 +2103,79 @@ END
my $quant = "";
my $rep = "_REP";
my $q = $$self{quant};
- if ($q) {
- my ($qfer,$how,$rest) = @{$$self{quant}};
- my $h = $how eq '!' ? 'g' :
- $how eq '?' ? 'f' :
- 'r';
- if ($qfer eq '*') {
- $PURE = 0;
- $quant = "\$C->_STAR$h$REV(";
- }
- elsif ($qfer eq '+') {
- $quant = "\$C->_PLUS$h$REV(";
- }
- elsif ($qfer eq '?') {
- $PURE = 0;
- $quant = "\$C->_OPT$h$REV(";
- }
- elsif ($qfer eq '**') {
- if (ref $rest) {
- if (ref $rest eq "RE_block") {
- $PURE = 0;
- $rep = "_REPINDIRECT$REV";
- $rest = $rest->walk();
- }
- else {
- $rep = "_REPSEP$REV";
- $rest = " sub { my \$C=shift;\n" . ::indent($rest->walk()) . "\n}";
- }
- }
- else {
- $PURE = 0 if $rest =~ /^0/;
- $rest = "'$rest'";
- }
- $quant = "\$C->$rep$h( $rest, ";
- }
- $result = $quant . "sub { my \$C=shift;\n" . ::indent($$self{atom}->walk(@_)) . "\n})";
- }
+ my $atom = $$self{atom}->walk(@_);
+ if ($q) {
+ if ($atom =~ s{ ^ \$C->_PATTERN\(qr/\\G(.*?)/\) $ }{(?:$1)}sx) {
+ my ($qfer,$how,$rest) = @{$$self{quant}};
+ my $h = $how eq '!' ? '' :
+ $how eq '?' ? '?' :
+ '+';
+ if ($qfer eq '**') {
+ $h = $how eq '!' ? 'g' :
+ $how eq '?' ? 'f' :
+ 'r';
+ if (ref $rest) {
+ if (ref $rest eq "RE_block") {
+ $PURE = 0;
+ $rep = "_REPINDIRECT$REV";
+ $rest = $rest->walk();
+ }
+ else {
+ $rep = "_REPSEP$REV";
+ $rest = " sub { my \$C=shift;\n" . ::indent($rest->walk()) . "\n}";
+ }
+ }
+ else {
+ $PURE = 0 if $rest =~ /^0/;
+ $rest = "'$rest'";
+ }
+ $quant = "\$C->$rep$h( $rest, ";
+ $result = $quant . "sub { my \$C=shift;\n" . ::indent($atom) . "\n})";
+ }
+ else {
+ $PURE = 0;
+ $result = "\$C->_PATTERN\(qr/\\G($atom$qfer$h)/\)";
+ }
+ }
+ else {
+ my ($qfer,$how,$rest) = @{$$self{quant}};
+ my $h = $how eq '!' ? 'g' :
+ $how eq '?' ? 'f' :
+ 'r';
+ if ($qfer eq '*') {
+ $PURE = 0;
+ $quant = "\$C->_STAR$h$REV(";
+ }
+ elsif ($qfer eq '+') {
+ $quant = "\$C->_PLUS$h$REV(";
+ }
+ elsif ($qfer eq '?') {
+ $PURE = 0;
+ $quant = "\$C->_OPT$h$REV(";
+ }
+ elsif ($qfer eq '**') {
+ if (ref $rest) {
+ if (ref $rest eq "RE_block") {
+ $PURE = 0;
+ $rep = "_REPINDIRECT$REV";
+ $rest = $rest->walk();
+ }
+ else {
+ $rep = "_REPSEP$REV";
+ $rest = " sub { my \$C=shift;\n" . ::indent($rest->walk()) . "\n}";
+ }
+ }
+ else {
+ $PURE = 0 if $rest =~ /^0/;
+ $rest = "'$rest'";
+ }
+ $quant = "\$C->$rep$h( $rest, ";
+ }
+ $result = $quant . "sub { my \$C=shift;\n" . ::indent($atom) . "\n})";
+ }
+ }
else {
- $result = $$self{atom}->walk(@_);
+ $result = $atom;
}
}
else {
@@ -2092,9 +2214,27 @@ END
"\n}, $outer)";
}
else {
- "map({ my \$C=\$_;\n" .
- ::indent($inner) .
- "\n} ($outer)[0])";
+ my $oi = $outer . $inner;
+ if ($oi =~ s{ ^ \$C->_PATTERN\(qr/\\G(.*?)/\) \$C->_PATTERN\(qr/\\G(.*?)/\) $ }{\$C->_PATTERN(qr/\\G$1$2/)}sx) {
+ $oi;
+ }
+ else {
+ my $in = ::indent($inner,2);
+ substr(<<"END",0,-1);
+do {
+ if (my (\$C) = ($outer)[0]) {
+$in;
+ }
+ else {
+ ();
+ }
+}
+END
+
+# "map({ my \$C=\$_;\n" .
+# ::indent($inner) .
+# "\n} ($outer)[0])";
+ }
}
}
View
1  mangle.pl
@@ -1,4 +1,5 @@
package main;
+use utf8;
sub mangle {
my @list = @_;
View
2  tryfoo
@@ -9,5 +9,5 @@ use Encode;
print "Starting...\n";
my $what = 'foo';
my $text = "@ARGV";
-my $r = Perl->new($text)->$what();
+my $r = STD->new($text)->$what();
print Dump($r);
Please sign in to comment.
Something went wrong with that request. Please try again.