Permalink
Browse files

Turn VAR_NAME_RE into a predicate IsValidVarName, and lower to re2c.

Introduce a new osh/match.py module to encapsulate fastlex.c (which
should really be named fastmatch.c).

It follows the same pattern as the lexer:

1. osh/match.py calls
2. native/fastlex.c when available, which calls
3. _devbuild/gen/osh-lex.h (could be osh-match.h)
4. which is generated by core/lexer_gen.py.

When fastlex isn't available, we use Python regular expressions.
  • Loading branch information...
Andy Chu
Andy Chu committed May 26, 2018
1 parent 86ec43d commit 10a13490208d45eec8d44f150a85524f08eac8da
Showing with 134 additions and 88 deletions.
  1. +18 −1 core/lexer_gen.py
  2. +13 −5 native/fastlex.c
  3. +10 −0 native/fastlex_test.py
  4. +3 −4 osh/cmd_parse.py
  5. +6 −12 osh/lex.py
  6. +8 −7 osh/lex_test.py
  7. +68 −0 osh/match.py
  8. +7 −58 osh/parse_lib.py
  9. +1 −1 scripts/count.sh
View
@@ -205,7 +205,7 @@ def TranslateLexer(lexer_def):
*/
static inline void MatchToken(int lex_mode, unsigned char* line, int line_len,
int start_pos, int* id, int* end_pos) {
int start_pos, int* id, int* end_pos) {
assert(start_pos <= line_len); /* caller should have checked */
unsigned char* p = line + start_pos; /* modified by re2c */
@@ -285,6 +285,21 @@ def TranslateLexer(lexer_def):
}
""")
def TranslateOther(var_name_re):
    """Print the C predicate IsValidVarName(), generated from var_name_re.

    Args:
      var_name_re: regex source string for a variable name.  It is converted
        with TranslateRegex() and spliced into the re2c template below.

    The emitted C function returns nonzero only when the pattern consumes the
    whole input (p == end), i.e. the match is anchored on the right like $.
    """
    re2_pat = TranslateRegex(var_name_re)
    # The cursor and end pointers are const-qualified and explicitly cast:
    # assigning the 'const char*' argument to a plain 'unsigned char*'
    # discards the qualifier and mixes pointer signedness, which C compilers
    # diagnose.  re2c only advances the pointer; it never writes through it.
    print(r"""
static inline int IsValidVarName(const char* s, int len) {
const unsigned char* p = (const unsigned char*)s; /* modified by re2c */
const unsigned char* end = (const unsigned char*)s + len;
/*!re2c
re2c:define:YYCURSOR = p;
%-30s { return p == end; } // Match must be anchored right, like $
* { return 0; }
*/
}
""" % re2_pat)
# note: use YYCURSOR and YYLIMIT
# limit should be the end of string
@@ -295,7 +310,9 @@ def main(argv):
action = argv[1]
if action == 'c':
# Print code to stdout.
TranslateLexer(lex.LEXER_DEF)
TranslateOther(lex.VAR_NAME_RE)
elif action == 'print-all':
# Top level is a switch statement.
View
@@ -43,7 +43,7 @@ fastlex_MatchToken(PyObject *self, PyObject *args) {
// Eol_Tok is inserted everywhere.
if (start_pos > line_len) {
PyErr_Format(PyExc_ValueError,
"Invalid MatchToken call (start_pos = %d, line_len =%d)",
"Invalid MatchToken call (start_pos = %d, line_len = %d)",
start_pos, line_len);
return NULL;
}
@@ -65,14 +65,22 @@ fastlex_MatchToken(PyObject *self, PyObject *args) {
return Py_BuildValue("(ii)", id, end_pos);
}
// Rename to TokenMatcher?
// LineLexer holds CharMatcher? or TokenMatcher?
// SlowTokenMatcher
// FastTokenMatcher
/*
 * Python binding: IsValidVarName(name) -> bool.
 *
 * Unpacks a single string argument (pointer plus length via "s#") and
 * forwards it to the generated IsValidVarName() predicate.  Returns NULL
 * when argument parsing fails, so the exception set by PyArg_ParseTuple
 * propagates to the caller.
 */
static PyObject *
fastlex_IsValidVarName(PyObject *self, PyObject *args) {
  const char *var_name = NULL;
  int var_name_len = 0;

  int parsed = PyArg_ParseTuple(args, "s#", &var_name, &var_name_len);
  if (!parsed) {
    return NULL;
  }

  long is_valid = IsValidVarName(var_name, var_name_len);
  return PyBool_FromLong(is_valid);
}
// Method table: maps each Python-visible function name to its C
// implementation, calling convention and docstring.
static PyMethodDef methods[] = {
// Both entries take positional arguments only (METH_VARARGS).
{"MatchToken", fastlex_MatchToken, METH_VARARGS,
"(lexer mode, line, start_pos) -> (id, end_pos)."},
{"IsValidVarName", fastlex_IsValidVarName, METH_VARARGS,
"Is it a valid var name?"},
{NULL, NULL},  // sentinel entry marking the end of the table
};
View
@@ -66,6 +66,16 @@ def testBug(self):
self.assertEqual(expected, tok_type)
def testIsValidVarName(self):
    """fastlex.IsValidVarName returns exactly True/False for each name."""
    # (expected result, candidate name), checked in this order.
    cases = [
        (True, 'abc'),
        (True, 'foo_bar'),
        (True, '_'),
        (False, ''),
        (False, '-x'),
        (False, 'x-'),
        (False, 'var_name-foo'),
    ]
    for expected, candidate in cases:
        self.assertEqual(expected, fastlex.IsValidVarName(candidate))
if __name__ == '__main__':
unittest.main()
View
@@ -16,8 +16,8 @@
from core import word
from core import util
from osh import match
from osh.meta import ast, Id, Kind, types
from osh.lex import VAR_NAME_RE
from osh.bool_parse import BoolParser
log = util.log
@@ -417,8 +417,7 @@ def _MakeAssignment(self, assign_kw, suffix_words):
return None
# No value is equivalent to ''
m = VAR_NAME_RE.match(static_val)
if not m:
if not match.IsValidVarName(static_val):
self.AddErrorContext('Invalid variable name %r', static_val, word=w)
return None
a = (static_val, assign_op_e.Equal, None, left_spid)
@@ -735,7 +734,7 @@ def _ParseForEachLoop(self):
self.AddErrorContext(
"Invalid for loop variable", word=self.cur_word)
return None
if not VAR_NAME_RE.match(iter_name):
if not match.IsValidVarName(iter_name):
self.AddErrorContext(
"Invalid for loop variable name", word=self.cur_word)
return None
View
@@ -88,7 +88,7 @@
Left Index:
_VAR_NAME_RE + '\[' Lit_LeftIndexLikeOpen
VAR_NAME_RE + '\[' Lit_LeftIndexLikeOpen
]= Lit_LeftIndexLikeClose
Indexed array and Associative array literals:
@@ -100,8 +100,6 @@
Op_LBracket Op_RBracketEqual
"""
import re
from osh.meta import Id, Kind, ID_SPEC
from core.lexer import C, R
@@ -132,16 +130,12 @@
C('\\\n', Id.Ignored_LineCont),
]
_VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'
# Used by osh/cmd_parse.py to validate for loop name. Note it must be
# anchored on the right.
VAR_NAME_RE = re.compile(_VAR_NAME_RE + '$')
VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'
# All Kind.VSub
_VARS = [
# Unbraced variables
R(r'\$' + _VAR_NAME_RE, Id.VSub_Name),
R(r'\$' + VAR_NAME_RE, Id.VSub_Name),
R(r'\$[0-9]', Id.VSub_Number),
C(r'$!', Id.VSub_Bang),
C(r'$@', Id.VSub_At),
@@ -205,7 +199,7 @@
C('#', Id.Lit_Pound), # For comments
# Needs to be LONGER than any other
#(_VAR_NAME_RE + r'\[', Id.Lit_Maybe_LHS_ARRAY),
#(VAR_NAME_RE + r'\[', Id.Lit_Maybe_LHS_ARRAY),
# Id.Lit_Maybe_LHS_ARRAY2
#(r'\]\+?=', Id.Lit_Maybe_ARRAY_ASSIGN_RIGHT),
@@ -454,7 +448,7 @@ def IsKeyword(name):
]
LEXER_DEF[lex_mode_e.VS_1] = [
R(_VAR_NAME_RE, Id.VSub_Name),
R(VAR_NAME_RE, Id.VSub_Name),
# ${11} is valid, compared to $11 which is $1 and then literal 1.
R(r'[0-9]+', Id.VSub_Number),
C('!', Id.VSub_Bang),
@@ -496,7 +490,7 @@ def IsKeyword(name):
# 0123
# A separate digits token makes this easier to parse STATICALLY. But this
# doesn't help with DYNAMIC parsing.
R(_VAR_NAME_RE, Id.Lit_ArithVarLike), # for variable names or 64#_
R(VAR_NAME_RE, Id.Lit_ArithVarLike), # for variable names or 64#_
R(r'[0-9]+', Id.Lit_Digits),
C('@', Id.Lit_At), # for 64#@ or ${a[@]}
C('#', Id.Lit_Pound), # for 64#a
View
@@ -9,6 +9,7 @@
from core.lexer import CompileAll, LineLexer
from core import test_lib
from osh import match
from osh import parse_lib
from osh.meta import ast, Id, Kind, LookupKind, types
from osh.lex import LEXER_DEF
@@ -185,41 +186,41 @@ def assertTokensEqual(self, left, right):
self.assertTrue(test_lib.TokensEqual(left, right))
def testReadOuter(self):
l = LineLexer(parse_lib._MakeMatcher(), '\n', self.arena)
l = LineLexer(match.MakeMatcher(), '\n', self.arena)
self.assertTokensEqual(
ast.token(Id.Op_Newline, '\n'), l.Read(lex_mode_e.OUTER))
def testRead_VS_ARG_UNQ(self):
l = LineLexer(parse_lib._MakeMatcher(), "'hi'", self.arena)
l = LineLexer(match.MakeMatcher(), "'hi'", self.arena)
t = l.Read(lex_mode_e.VS_ARG_UNQ)
self.assertEqual(Id.Left_SingleQuote, t.id)
def testLookAhead(self):
# Lines always end with '\n'
l = LineLexer(parse_lib._MakeMatcher(), '', self.arena)
l = LineLexer(match.MakeMatcher(), '', self.arena)
self.assertTokensEqual(
ast.token(Id.Unknown_Tok, ''), l.LookAhead(lex_mode_e.OUTER))
l = LineLexer(parse_lib._MakeMatcher(), 'foo', self.arena)
l = LineLexer(match.MakeMatcher(), 'foo', self.arena)
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'foo'), l.Read(lex_mode_e.OUTER))
self.assertTokensEqual(
ast.token(Id.Unknown_Tok, ''), l.LookAhead(lex_mode_e.OUTER))
l = LineLexer(parse_lib._MakeMatcher(), 'foo bar', self.arena)
l = LineLexer(match.MakeMatcher(), 'foo bar', self.arena)
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'foo'), l.Read(lex_mode_e.OUTER))
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'bar'), l.LookAhead(lex_mode_e.OUTER))
# No lookahead; using the cursor!
l = LineLexer(parse_lib._MakeMatcher(), 'func(', self.arena)
l = LineLexer(match.MakeMatcher(), 'func(', self.arena)
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'func'), l.Read(lex_mode_e.OUTER))
self.assertTokensEqual(
ast.token(Id.Op_LParen, '('), l.LookAhead(lex_mode_e.OUTER))
l = LineLexer(parse_lib._MakeMatcher(), 'func (', self.arena)
l = LineLexer(match.MakeMatcher(), 'func (', self.arena)
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'func'), l.Read(lex_mode_e.OUTER))
self.assertTokensEqual(
View
@@ -0,0 +1,68 @@
#!/usr/bin/python
"""
match.py - match with generated re2c code or Python regexes.
"""
from core import lexer
from osh import lex
from osh.meta import Id, IdInstance
# bin/osh should work without compiling fastlex? But we want all the unit
# tests to run with a known version of it.
try:
import fastlex
except ImportError:
fastlex = None
class _MatchToken_Slow(object):
    """Pure-Python token matcher built from compiled regexes.

    Instances are callable with the same (lex_mode, line, start_pos)
    arguments as _MatchToken_Fast, so either can be handed to the lexer.
    """

    def __init__(self, lexer_def):
        """Compile every pattern list in lexer_def.

        Args:
          lexer_def: dict mapping a lexer mode to a pattern list accepted by
            lexer.CompileAll().
        """
        self.lexer_def = {
            state: lexer.CompileAll(pat_list)
            for state, pat_list in lexer_def.items()
        }

    def __call__(self, lex_mode, line, start_pos):
        """Return (id, end_pos) for the longest match at start_pos.

        Args:
          lex_mode: key into the compiled lexer definition.
          line: the string being lexed.
          start_pos: index in line at which matching starts.

        Returns:
          (token type, end position).  When start_pos is at or past the end
          of line, (Id.Eol_Tok, start_pos).

        Raises:
          AssertionError: if no pattern matches at start_pos.
        """
        # Simulate the EOL handling in re2c.
        if start_pos >= len(line):
            return Id.Eol_Tok, start_pos

        # Track only the best candidate instead of collecting every match;
        # the matched text itself is never needed.
        best_end = -1
        best_type = None
        for regex, tok_type in self.lexer_def[lex_mode]:
            m = regex.match(line, start_pos)  # left-anchored
            # Strict '>' means the earliest pattern wins among equally long
            # matches, which is what max() over the match list did.
            if m and m.end(0) > best_end:
                best_end = m.end(0)
                best_type = tok_type

        if best_end == -1:
            raise AssertionError(
                'no match at position %d: %r' % (start_pos, line))
        return best_type, best_end
def _MatchToken_Fast(lex_mode, line, start_pos):
    """Match one token with the native fastlex module.

    Returns:
      (id, end_pos), where id is obtained from IdInstance().
    """
    result = fastlex.MatchToken(lex_mode.enum_id, line, start_pos)
    id_int, end_pos = result
    # IMPORTANT: Id instances are reused here rather than created per token.
    # Ids are very common, so this saves memory.
    shared_id = IdInstance(id_int)
    return shared_id, end_pos
def MakeMatcher():
    """Return the token matcher: native if fastlex was imported, else regex.

    Both results are callables taking (lex_mode, line, start_pos).
    """
    if not fastlex:
        # No native module: build the regex-based matcher from LEXER_DEF.
        return _MatchToken_Slow(lex.LEXER_DEF)
    return _MatchToken_Fast
# IsValidVarName(s) -> bool.  Bound to the native implementation when fastlex
# is available, otherwise to a Python regex with the same contract.
if fastlex:
    IsValidVarName = fastlex.IsValidVarName
else:
    import re
    # Used by osh/cmd_parse.py to validate for loop name.  Note it must be
    # anchored on the right.  \Z is used rather than $ because $ also matches
    # just before a trailing newline, which would accept 'foo\n'; the native
    # predicate requires the match to end exactly at the end of the string.
    _VAR_NAME_RE = re.compile(lex.VAR_NAME_RE + r'\Z')

    def IsValidVarName(s):
        """Return True if s is a valid variable name, else False.

        Always returns a real bool (not a match object), like the native
        fastlex.IsValidVarName.
        """
        return _VAR_NAME_RE.match(s) is not None
Oops, something went wrong.

0 comments on commit 10a1349

Please sign in to comment.