/
Lexer.pm
115 lines (97 loc) · 3.24 KB
/
Lexer.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/perl
use warnings;
use strict;
package Mowyw::Lexer;
=pod
=head1 NAME
Mowyw::Lexer - Simple Lexer
=head1 SYNOPSIS
use Mowyw::Lexer qw(lex);
# suppose you want to parse simple math expressions
my @input_tokens = (
['Int', qr/(?:-|\+)?\d+/],
['Op', qr/\+|\*|-|\//],
['Brace_Open', qr/\(/],
['Brace_Close', qr/\)/],
['Whitespace', qr/\s/, sub { return undef; }],
);
my $text = "-12 * (3+4)";
foreach (lex($text, \@input_tokens){
my ($name, $text) = @$_;
print "Found Token $name: $text\n";
}
=head1 DESCRIPTION
Mowyw::Lexer is a simple lexer that breaks up a text into tokens, depenending on the input tokens you provide
The only exported method is lex($$), which expects input text as its first argument and a array ref to list of input tokens.
Each input token consists of a token name (which you can choose freely), a regexwhich matches the desired token, and optionally a reference to a functions that takes the matched token text as its argument. The token text is replaced by the return value of that function. If the function returns undef, that token will not be included in the list of output tokens.
lex() returns a list of output tokens, each output token is a reference to a list which contains the token name and the matched text.
If there is unmatched text, it is returned with the token name UNMATCHED.
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2007 by Moritz Lenz, http://moritz.faui2k3.org, moritz@faui2k3.org
This Program and its Documentation is free software. You may distribute it under the same terms as perl itself.
However all code examples are to be public domain, so you can use it in any way you want to.
=cut
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(lex);
our %EXPORT_TAGS = (":all" => \@EXPORT);
sub lex($$){
my ($text, $tokens) = @_;
return () unless (length $text);
my @res;
while (length($text) > 0){
my $matched = 0;
# try to match at the start of $text
foreach (@$tokens){
my $re = $_->[1];
if ($text =~ m#^$re#s){
$matched = 1;
my $match = $&;
die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
if (my $fun = $_->[2]){
$match = &$fun($match);
}
if (defined $match){
push @res, [$_->[0], $&];
}
$text = $';
last;
}
}
unless ($matched){
# no token matched, look what the closest token is
# my $remain = substr $text, 0, 30;
# die "No Token matched the remaining String which starts as $remain\n";
my $next_token;
my $min = length($text);
foreach(@$tokens){
my $re = $_->[1];
if ($text =~ m#$re#s){
if (length($`) < $min){
$min = length($`);
$next_token = $_;
$matched = 1;
}
}
}
if ($matched){
my $re = $next_token->[1];
$text =~ m#$re#;
push @res, ['UNMATCHED', $`];
my $match = $&;
die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
if (my $fun =$next_token->[2]){
$match = &$fun($match);
}
push @res, [$next_token->[0], $match] if (defined $match);
$text = $';
} else {
push @res, ['UNMATCHED', $text];
# print "END OF FILE IS UNMATCHED: $text\n";
$text = "";
}
}
}
return @res;
}
1;