Permalink
Browse files

Turn on re2c warnings and make them fatal.

- Add an explicit Id.Eol_Tok for the \0 sentinel.  All lexer modes
  automatically match \0.
- Turn the . pattern into [^\0] everywhere.
- Fix errors caught by re2c exhaustiveness checks!
  • Loading branch information...
Andy Chu
Andy Chu committed Nov 25, 2017
1 parent f2343a0 commit 35c2cf30873a9ac1471fb95148dc19b4e09e951d
Showing with 35 additions and 21 deletions.
  1. +7 −7 build/codegen.sh
  2. +1 −0 core/id_kind.py
  3. +1 −0 core/lexer_gen.py
  4. +26 −14 osh/lex.py
View
@@ -44,9 +44,7 @@ install-re2c() {
make
}
re2c() {
_deps/re2c-1.0.3/re2c "$@"
}
re2c() { _deps/re2c-1.0.3/re2c "$@"; }
ast-gen() {
PYTHONPATH=. osh/ast_gen.py "$@" | tee _build/gen/osh-ast.h
@@ -71,7 +69,11 @@ print-all() { lexer-gen print-all; }
# re2c native.
osh-lex-gen-native() {
re2c -o _build/gen/osh-lex.h _build/gen/osh-lex.re2c.h
# Turn on all warnings and make them fatal.
# The COMMENT state can match an empty string at the end of a line, e.g.
# '#\n'. So we have to turn that warning off.
re2c -W -Wno-match-empty-string -Werror \
-o _build/gen/osh-lex.h _build/gen/osh-lex.re2c.h
}
all() {
@@ -88,9 +90,7 @@ all() {
}
# Size profiler for binaries. TODO: Fold this into benchmarks/
bloaty() {
~/git/other/bloaty/bloaty "$@"
}
bloaty() { ~/git/other/bloaty/bloaty "$@"; }
symbols() {
local obj=_devbuild/py-ext/x86_64/fastlex.so
View
@@ -157,6 +157,7 @@ def _AddKinds(spec):
# TODO: Unknown_Tok is OK, but Undefined_Id is better
spec.AddKind('Undefined', ['Tok']) # for initial state
spec.AddKind('Unknown', ['Tok']) # for when nothing matches
spec.AddKind('Eol', ['Tok']) # no more tokens on line (\0)
spec.AddKind('Eof', ['Real', 'RParen', 'Backtick'])
View
@@ -224,6 +224,7 @@ def TranslateLexer(lexer_def):
from core import id_kind
id_name = id_kind.IdName(token_id)
print ' %-30s { *id = id__%s; break; }' % (re2_pat, id_name)
print ' %-30s { *id = id__%s; break; }' % (r'"\x00"', 'Eol_Tok')
print ' */'
print ' }'
print ' break;'
View
@@ -88,7 +88,6 @@
# chain the groups in order. It might make sense to experiment with the order
# too.
# Explicitly exclude newline, although '.' would work too
_BACKSLASH = [
R(r'\\[^\n\0]', Id.Lit_EscapedChar),
C('\\\n', Id.Ignored_LineCont),
@@ -149,9 +148,11 @@
LEXER_DEF = {} # TODO: Should be a list so we enforce order.
# Anything until the end of the line is a comment.
# Anything until the end of the line is a comment. Does not match the newline
# itself. We want to switch modes and possibly process Op_Newline for here
# docs, etc.
LEXER_DEF[lex_mode_e.COMMENT] = [
R(r'.*', Id.Ignored_Comment) # does not match newline
R(r'[^\n\0]*', Id.Ignored_Comment)
]
_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
@@ -199,7 +200,7 @@
R(r'[0-9]*<>', Id.Redir_LessGreat),
R(r'[0-9]*>\|', Id.Redir_Clobber),
R(r'.', Id.Lit_Other), # any other single char is a literal
R(r'[^\0]', Id.Lit_Other), # any other single char is a literal
]
# In OUTER and DBRACKET states.
@@ -287,9 +288,10 @@ def IsKeyword(name):
R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
C('|', Id.Op_Pipe),
C(')', Id.Op_RParen), # maybe be translated to Id.ExtGlob_RParen
R('.', Id.Lit_Other), # everything else is literal
R(r'[^\0]', Id.Lit_Other), # everything else is literal
]
LEXER_DEF[lex_mode_e.BASH_REGEX] = [
# Match these literals first, and then the rest of the OUTER state I guess.
# That's how bash works.
@@ -300,7 +302,13 @@ def IsKeyword(name):
C('(', Id.Lit_Chars),
C(')', Id.Lit_Chars),
C('|', Id.Lit_Chars),
] + _UNQUOTED
] + [
# Avoid "unreachable rule error"
(is_regex, pat, re_list) for
(is_regex, pat, re_list) in _UNQUOTED
if not (is_regex == False and pat in ('(', ')', '|'))
]
LEXER_DEF[lex_mode_e.DQ] = [
# Only 4 characters are backslash escaped inside "".
@@ -311,7 +319,7 @@ def IsKeyword(name):
R(r'[^$`"\0\\]+', Id.Lit_Chars), # matches a line at most
# NOTE: When parsing here doc line, this token doesn't end it.
C('"', Id.Right_DoubleQuote),
R(r'.', Id.Lit_Other), # e.g. "$"
R(r'[^\0]', Id.Lit_Other), # e.g. "$"
]
_VS_ARG_COMMON = _BACKSLASH + [
@@ -326,15 +334,15 @@ def IsKeyword(name):
_VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
# NOTE: added < and > so it doesn't eat <()
R(r'[^$`/}"\'\0\\#%<>]+', Id.Lit_Chars),
R(r'.', Id.Lit_Other), # e.g. "$", must be last
R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
]
# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VS_ARG_DQ] = _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars), # matches a line at most
# Weird wart: even in double quoted state, double quotes are allowed
C('"', Id.Left_DoubleQuote),
R(r'.', Id.Lit_Other), # e.g. "$", must be last
R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
]
# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
@@ -345,11 +353,15 @@ def IsKeyword(name):
]
# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
# point of it is that it supports other backslash escapes like \n!
# point of it is that it supports other backslash escapes like \n! It just
# becomes a regular backslash.
LEXER_DEF[lex_mode_e.DOLLAR_SQ] = [
R(r"[^'\\\0]+", Id.Lit_Chars),
R(r"\\.", Id.Lit_EscapedChar),
C("'", Id.Right_SingleQuote),
R(r"\\[^\0]", Id.Lit_EscapedChar),
# Backslash that ends the file! Caught by re2c exhaustiveness check. For
# now, make it Unknown.
C('\\\0', Id.Unknown_Tok),
]
LEXER_DEF[lex_mode_e.VS_1] = [
@@ -369,7 +381,7 @@ def IsKeyword(name):
C('\\\n', Id.Ignored_LineCont),
C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
R(r'.', Id.Unknown_Tok), # any char except newline
R(r'[^\0]', Id.Unknown_Tok), # any char except newline
]
LEXER_DEF[lex_mode_e.VS_2] = \
@@ -380,7 +392,7 @@ def IsKeyword(name):
C('\\\n', Id.Ignored_LineCont),
C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
R(r'.', Id.Unknown_Tok), # any char except newline
R(r'[^\0]', Id.Unknown_Tok), # any char except newline
]
# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
@@ -403,7 +415,7 @@ def IsKeyword(name):
# TODO: 64#@ interferes with VS_AT. Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + [
C('\\\n', Id.Ignored_LineCont),
R(r'.', Id.Unknown_Tok) # any char. This should be a syntax error.
R(r'[^\0]', Id.Unknown_Tok) # any char. This should be a syntax error.
]
# Notes on BASH_REGEX states

0 comments on commit 35c2cf3

Please sign in to comment.