/
Lexer.pm
153 lines (131 loc) · 4.45 KB
/
Lexer.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/perl
use warnings;
use strict;
package Mowyw::Lexer;
=pod
=head1 NAME
Mowyw::Lexer - Simple Lexer
=head1 SYNOPSIS
use Mowyw::Lexer qw(lex);
# suppose you want to parse simple math expressions
my @input_tokens = (
['Int', qr/(?:-|\+)?\d+/],
['Op', qr/\+|\*|-|\//],
['Brace_Open', qr/\(/],
['Brace_Close', qr/\)/],
['Whitespace', qr/\s/, sub { return undef; }],
);
my $text = "-12 * (3+4)";
foreach (lex($text, \@input_tokens){
my ($name, $text, $position, $line) = @$_;
print "Found Token $name: '$text'\n"
print " at position $position line $line\n";
}
=head1 DESCRIPTION
Mowyw::Lexer is a simple lexer that breaks up a text into tokens according to
regexes you provide.
The only exported subroutine is C<lex>, which expects input text as its first
argument, and a array references as second argument, which contains arrays of
token names and regexes.
Each input token consists of a token name (which you can choose freely), a
regexwhich matches the desired token, and optionally a reference to a
functions that takes the matched token text as its argument. The token text is
replaced by the return value of that function. If the function returns undef,
that token will not be included in the list of output tokens.
C<lex> returns a list of output tokens, each output token is a reference to a
list which contains the token name, matched text, position of the match in the
input string (zero-based, suitable for passing to C<substr>), and line number
of the start of the match (one-based, suitable for humans).
If there is unmatched text, it is returned with the token name C<UNMATCHED>.
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2007,2009 by Moritz Lenz, http://perlgeek.de/, moritz@faui2k3.org
This Program and its Documentation is free software. You may distribute it
under the terms of the Artistic License 2.0 as published by The Perl
Foundation.
However all code examples are public domain, so you can use it in any way you
want to.
=cut
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(lex);
our %EXPORT_TAGS = (":all" => \@EXPORT);
sub lex {
my ($text, $tokens) = @_;
my ($last_line_number, $last_pos) = (0, 0);
my $pos_and_line_number = sub {
my $pos = shift;
$last_line_number +=
(substr($text, $last_pos, $pos - $last_pos) =~ tr/\n//);
$last_pos = $pos;
return ($pos, $last_line_number + 1);
};
return () unless length $text;
my @res;
pos($text) = 0;
while (pos($text) < length($text)){
my $matched = 0;
# try to match at the start of $text
foreach (@$tokens){
my $re = $_->[1];
if ($text =~ m#\G($re)#gc){
$matched = 1;
my $match = $1;
die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
if (my $fun = $_->[2]){
$match = $fun->($match);
}
if (defined $match){
push @res, [$_->[0],
$match,
$pos_and_line_number->(pos($text) - length($match)),
];
}
last;
}
}
unless ($matched){
my $next_token;
my $next_token_match;
my $match;
my $min = length($text);
my $pos = pos($text);
# find the token that matches first
foreach(@$tokens){
pos($text) = $pos;
my $re = $_->[1];
if ($text =~ m#\G((?s:.)*?)($re)#gc){
if ($+[1] < $min){
$min = $+[1];
$next_token = $_;
$next_token_match = $2;
$match = $1;
}
}
}
if (defined $match){
push @res, ['UNMATCHED',
$match,
$pos_and_line_number->($pos - length($pos))
];
die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n"
unless (length($next_token_match) > 0);
if (my $fun = $next_token->[2]){
$match = $fun->($match);
}
push @res, [$next_token->[0],
$next_token_match,
$pos_and_line_number->($min),
] if defined $match;
pos($text) = $min + length($next_token_match);
} else {
push @res, ['UNMATCHED',
substr($text, $pos),
$pos_and_line_number->($pos)
];
pos($text) = length($text);
}
}
}
return @res;
}
-1;