Permalink
Browse files

[lexer] fix pos() bug when callbacks changed the token length; retab

  • Loading branch information...
1 parent 1878689 commit 3430736e3e73a4045451722ca6cd113a1805873a @moritz committed Jan 7, 2010
Showing with 81 additions and 75 deletions.
  1. +70 −65 lib/Mowyw/Lexer.pm
  2. +1 −1 mowyw.1.txt
  3. +10 −9 t/lexer.t
View
@@ -9,22 +9,22 @@ package Mowyw::Lexer;
Mowyw::Lexer - Simple Lexer
=head1 SYNOPSIS
-
- use Mowyw::Lexer qw(lex);
- # suppose you want to parse simple math expressions
- my @input_tokens = (
- ['Int', qr/(?:-|\+)?\d+/],
- ['Op', qr/\+|\*|-|\//],
- ['Brace_Open', qr/\(/],
- ['Brace_Close', qr/\)/],
- ['Whitespace', qr/\s/, sub { return undef; }],
- );
- my $text = "-12 * (3+4)";
- foreach (lex($text, \@input_tokens){
- my ($name, $text, $position, $line) = @$_;
- print "Found Token $name: '$text'\n"
+
+ use Mowyw::Lexer qw(lex);
+ # suppose you want to parse simple math expressions
+ my @input_tokens = (
+ ['Int', qr/(?:-|\+)?\d+/],
+ ['Op', qr/\+|\*|-|\//],
+ ['Brace_Open', qr/\(/],
+ ['Brace_Close', qr/\)/],
+ ['Whitespace', qr/\s/, sub { return undef; }],
+ );
+ my $text = "-12 * (3+4)";
+ foreach (lex($text, \@input_tokens){
+ my ($name, $text, $position, $line) = @$_;
+ print "Found Token $name: '$text'\n"
print " at position $position line $line\n";
- }
+ }
=head1 DESCRIPTION
@@ -68,7 +68,7 @@ our @EXPORT = qw(lex);
our %EXPORT_TAGS = (":all" => \@EXPORT);
sub lex {
- my ($text, $tokens) = @_;
+ my ($text, $tokens) = @_;
my ($last_line_number, $last_pos) = (0, 0);
my $pos_and_line_number = sub {
@@ -79,75 +79,80 @@ sub lex {
return ($pos, $last_line_number + 1);
};
- return () unless length $text;
- my @res;
+ return () unless length $text;
+ my @res;
pos($text) = 0;
- while (pos($text) < length($text)){
- my $matched = 0;
- # try to match at the start of $text
- foreach (@$tokens){
- my $re = $_->[1];
- if ($text =~ m#\G($re)#gc){
- $matched = 1;
- my $match = $1;
- die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
- if (my $fun = $_->[2]){
- $match = $fun->($match);
- }
- if (defined $match){
- push @res, [$_->[0],
+ while (pos($text) < length($text)){
+ my $matched = 0;
+ # try to match at the start of $text
+ foreach (@$tokens){
+ my $re = $_->[1];
+ if ($text =~ m#\G($re)#gc){
+ $matched = 1;
+ my $match = $1;
+ die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
+ my $token_pos = pos($text) - length($match);
+ if (my $fun = $_->[2]){
+ $match = $fun->($match);
+ }
+ if (defined $match){
+ push @res, [$_->[0],
$match,
- $pos_and_line_number->(pos($text) - length($match)),
+ $pos_and_line_number->($token_pos),
];
- }
- last;
- }
- }
- unless ($matched){
- my $next_token;
+ }
+ last;
+ }
+ }
+ unless ($matched){
+ my $next_token;
my $next_token_match;
my $match;
- my $min = length($text);
+ my $min = length($text);
my $pos = pos($text);
# find the token that matches first
- foreach(@$tokens){
- pos($text) = $pos;
- my $re = $_->[1];
- if ($text =~ m#\G((?s:.)*?)($re)#gc){
- if ($+[1] < $min){
- $min = $+[1];
- $next_token = $_;
+ my $token_pos;
+ foreach(@$tokens){
+ my $re = $_->[1];
+ if ($text =~ m#\G((?s:.)*?)($re)#gc){
+ if ($+[1] < $min){
+ $min = $+[1];
+ $next_token = $_;
$next_token_match = $2;
- $match = $1;
- }
- }
- }
- if (defined $match){
- push @res, ['UNMATCHED',
+ $match = $1;
+ $token_pos = pos($text) - length($match);
+ }
+ }
+ pos($text) = $pos;
+ }
+ if (defined $match){
+ push @res, ['UNMATCHED',
$match,
- $pos_and_line_number->($pos - length($pos))
+ $pos_and_line_number->($token_pos - length($match)),
];
- die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n"
+ die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n"
unless (length($next_token_match) > 0);
- if (my $fun = $next_token->[2]){
- $match = $fun->($match);
- }
- push @res, [$next_token->[0],
+ if (my $fun = $next_token->[2]){
+ $match = $fun->($match);
+ }
+ push @res, [$next_token->[0],
$next_token_match,
$pos_and_line_number->($min),
] if defined $match;
pos($text) = $min + length($next_token_match);
- } else {
- push @res, ['UNMATCHED',
+ } else {
+ push @res, ['UNMATCHED',
substr($text, $pos),
$pos_and_line_number->($pos)
];
pos($text) = length($text);
- }
- }
- }
- return @res;
+ }
+ }
+ }
+ return @res;
}
-1;
+
+# vim: sw=4 ts=4 expandtab
View
@@ -312,7 +312,7 @@ it was written with generality in mind, there are still some restrictions that
were accepted as a tradeoff for simplicity.
* Error messages aren't very useful for the uninitiated, though the line
- number makes it pretty obvious where to search for for the error
+ number makes it pretty obvious where to search for for the error.
* Disk space: mowyw keeps two complete copies of a project on disk, the
source files and the resulting online files. This might not be optimal, and
consumes unnecessary space for files that are not processed anyway.
View
@@ -5,28 +5,29 @@ use Mowyw::Lexer qw(lex);
use Test::More tests => 13;
my @tokens = (
- ['Int', qr/(?:-|\+)?\d+/],
+ ['Int', qr/(?:-|\+)?\d+/, sub { 2 * $_[0]}],
['Op', qr/\+|\*|-|\//],
['Brace_Open', qr/\(/],
['Brace_Close', qr/\)/],
['Whitespace', qr/\s+/, sub { return undef; }],
);
-my $text = "12 + foo\n (3 * (4 + -1))BAR";
+
+my $text = "12 + foo\n (3 * (60 + -1))BAR";
my @expected = split /\n/, <<EXPECTED;
-Int: 12 (0; 1)
+Int: 24 (0; 1)
Op: + (3; 1)
UNMATCHED: foo (4; 1)
Brace_Open: ( (10; 2)
-Int: 3 (11; 2)
+Int: 6 (11; 2)
Op: * (13; 2)
Brace_Open: ( (15; 2)
-Int: 4 (16; 2)
-Op: + (18; 2)
-Int: -1 (20; 2)
-Brace_Close: ) (22; 2)
+Int: 120 (16; 2)
+Op: + (19; 2)
+Int: -2 (21; 2)
Brace_Close: ) (23; 2)
-UNMATCHED: BAR (24; 2)
+Brace_Close: ) (24; 2)
+UNMATCHED: BAR (25; 2)
EXPECTED
for (lex($text, \@tokens)) {

0 comments on commit 3430736

Please sign in to comment.