@@ -1,13 +1,47 @@
"""
lex.py -- Shell lexer.
It consists of a series of lexical modes, each with regex -> Id mappings .
It consists of a series of lexer modes, each with a regex -> Id mapping .
TODO :
- \0 should be Id.Op_Newline or Id.WS_Newline. And then the higher level Lexer
should return the Id.Eof_Real token, as it does now.
NOTE : Input Handling Affects Regular Expressions
------------------------------------------------
Remaining constructs:
We pass one line at a time to the Lexer, via LineLexer. We must be able to
parse one line at a time because of interactive parsing (e.g. using the output
of GNU readline.)
There are two ways we could handle input:
1. Every line is NUL terminated:
'one\n\0 ' 'last line\0 '
2. Every line is terminated by NUL, except the last:
'one\n ' 'last line\0 '
The advantage of #2 is that in the common case of reading files, we don't have
to do it one line at a time. We could slurp the whole file in, or mmap() it,
etc.
The second option makes the regular expressions more complicated, so I'm
punting on it for now. We assume the first.
That means:
- No regexes below should match \0 . They are added by
core/lexer_gen.py for re2c.
For example, [^']+ is not valid. [^'\0 ]+ is correct. Otherwise we would read
unitialized memory past the sentinel.
Python's regex engine knows where the end of the input string is, so it
doesn't require need a sentinel like \0 .
Note that re2c is not able to work in a mode with a strict length limit. It
would cause too many extra checks? The language is then no longer regular!
http://re2c.org/examples/example_03.html
Remaining constructs
--------------------
Case terminators:
;;& Op_DSemiAmp for case
@@ -56,7 +90,7 @@
# Explicitly exclude newline, although '.' would work too
_BACKSLASH = [
R(r ' \\ [^ \n ]' , Id.Lit_EscapedChar),
R(r ' \\ [^ \n\0 ]' , Id.Lit_EscapedChar),
C(' \\\n ' , Id.Ignored_LineCont),
]
@@ -141,7 +175,6 @@
R(r ' [ \t\r ]+ ' , Id.WS_Space ),
C(' \0 ' , Id.Eof_Real), # TODO : Remove?
C(' \n ' , Id.Op_Newline),
C(' &' , Id.Op_Amp),
@@ -251,10 +284,9 @@ def IsKeyword(name):
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF [lex_mode_e.EXTGLOB ] = \
_BACKSLASH + _LEFT_SUBS + _VARS + _EXTGLOB_BEGIN + [
R(r ' [^ \\ $`" \' |)@*+!? ]+ ' , Id.Lit_Chars),
R(r ' [^ \\ $`" \' |)@*+!? \0 ]+ ' , Id.Lit_Chars),
C(' |' , Id.Op_Pipe),
C(' )' , Id.Op_RParen), # maybe be translated to Id.ExtGlob_RParen
C(' \0 ' , Id.Eof_Real),
R(' .' , Id.Lit_Other), # everything else is literal
]
@@ -279,7 +311,6 @@ def IsKeyword(name):
R(r ' [^ $`" \0\\ ]+ ' , Id.Lit_Chars), # matches a line at most
# NOTE : When parsing here doc line, this token doesn't end it.
C(' "' , Id.Right_DoubleQuote),
C(' \0 ' , Id.Eof_Real),
R(r ' . ' , Id.Lit_Other), # e.g. "$"
]
@@ -295,7 +326,6 @@ def IsKeyword(name):
_VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
# NOTE : added < and > so it doesn't eat <()
R(r ' [^ $`/}" \'\0\\ #%<> ]+ ' , Id.Lit_Chars),
C(' \0 ' , Id.Eof_Real),
R(r ' . ' , Id.Lit_Other), # e.g. "$", must be last
]
@@ -304,25 +334,22 @@ def IsKeyword(name):
R(r ' [^ $`/}" \0\\ #% ]+ ' , Id.Lit_Chars), # matches a line at most
# Weird wart: even in double quoted state, double quotes are allowed
C(' "' , Id.Left_DoubleQuote),
C(' \0 ' , Id.Eof_Real),
R(r ' . ' , Id.Lit_Other), # e.g. "$", must be last
]
# NOTE : Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF [lex_mode_e.SQ ] = [
R(r " [^ ' ]+ " , Id.Lit_Chars), # matches a line at most
R(r " [^ ' \0 ]+ " , Id.Lit_Chars), # matches a line at most
C(" '" , Id.Right_SingleQuote),
C(' \0 ' , Id.Eof_Real),
]
# NOTE : Id.Ignored_LineCont is also not supported here, even though the whole
# point of it is that supports other backslash escapes like \n!
LEXER_DEF [lex_mode_e.DOLLAR_SQ ] = [
R(r " [^ ' \\ ]+ " , Id.Lit_Chars),
R(r " [^ ' \\\0 ]+ " , Id.Lit_Chars),
R(r " \\ . " , Id.Lit_EscapedChar),
C(" '" , Id.Right_SingleQuote),
C(' \0 ' , Id.Eof_Real),
]
LEXER_DEF [lex_mode_e.VS_1 ] = [
@@ -341,7 +368,6 @@ def IsKeyword(name):
C(' \\\n ' , Id.Ignored_LineCont),
C(' \0 ' , Id.Eof_Real), # not used?
C(' \n ' , Id.Unknown_Tok), # newline not allowed inside ${}
R(r ' . ' , Id.Unknown_Tok), # any char except newline
]
0 comments on commit
16b11b9