Permalink
Browse files

Translate ECHO_E_LEXER to a native fastlex.MatchEchoToken() function.

The new MatchEchoToken() function is not hooked up to anything yet.

- Rename MatchToken() -> MatchOshToken()
- Found and removed a redundant rule that matches \\$.
  • Loading branch information...
Andy Chu
Andy Chu committed May 29, 2018
1 parent 57bb691 commit f7fde14130909d3e867ea5b5b90d66e05c600c62
Showing with 92 additions and 44 deletions.
  1. +45 −10 core/lexer_gen.py
  2. +6 −1 core/word_compile.py
  3. +29 −15 native/fastlex.c
  4. +10 −9 native/fastlex_test.py
  5. +0 −4 osh/lex.py
  6. +2 −5 osh/match.py
View
@@ -167,7 +167,7 @@ def TranslateTree(re_tree, f, in_char_class=False):
f.write('.')
else:
raise AssertionError(name)
raise RuntimeError("I don't understand regex construct: %r" % name)
# NOTE: negate and not_literal are sort of duplicated
@@ -178,7 +178,11 @@ def TranslateRegex(pat):
#import pprint
#print(pprint.pformat(re_tree), file=sys.stderr)
f = cStringIO.StringIO()
TranslateTree(re_tree, f)
try:
TranslateTree(re_tree, f)
except RuntimeError:
print('Error translating %r' % pat, file=sys.stderr)
raise
return f.getvalue()
@@ -192,7 +196,37 @@ def TranslateRegex(pat):
# http://re2c.org/examples/example_03.html
def TranslateLexer(lexer_def):
def TranslateEcholexer(echo_def):
# Print C source for a `static inline void MatchEchoToken(...)` function to
# stdout.  The function body is a re2c block with one rule per lexer entry.
#
# Args:
#   echo_def: iterable of (is_regex, pat, token_id) tuples.  When is_regex is
#     true, pat is translated with TranslateRegex(); otherwise it is
#     translated with TranslateConstant().
#
# Returns nothing; the generated code is written with print().
#
# Header: function signature, the start_pos assertion, the cursor `p` that
# re2c advances, and the opening of the for(;;) loop and re2c comment block.
print(r"""
static inline void MatchEchoToken(unsigned char* line, int line_len,
int start_pos, int* id, int* end_pos) {
assert(start_pos <= line_len); /* caller should have checked */
unsigned char* p = line + start_pos; /* modified by re2c */
unsigned char* YYMARKER; /* why do we need this? */
for (;;) {
/*!re2c
""")
# One re2c rule per entry: the action stores the token id in *id and breaks
# out of the for(;;) loop.
for is_regex, pat, token_id in echo_def:
if is_regex:
re2c_pat = TranslateRegex(pat)
else:
re2c_pat = TranslateConstant(pat)
# The name from meta.IdName() is appended to the `id__` prefix in the action.
id_name = meta.IdName(token_id)
# %-30s left-justifies the pattern in a 30-character column.
print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
# Footer: close the re2c comment and the loop, then report the end position
# as an offset from the start of `line`.
print("""
*/
}
*end_pos = p - line; /* relative */
}
""")
def TranslateOshLexer(lexer_def):
# https://stackoverflow.com/questions/12836171/difference-between-an-inline-function-and-static-inline-function
# Has to be 'static inline' rather than 'inline', otherwise the
# _bin/oil.ovm-dbg build fails (but the _bin/oil.ovm doesn't!).
@@ -208,7 +242,7 @@ def TranslateLexer(lexer_def):
re2c:yyfill:enable = 0; // generated code doesn't ask for more input
*/
static inline void MatchToken(int lex_mode, unsigned char* line, int line_len,
static inline void MatchOshToken(int lex_mode, unsigned char* line, int line_len,
int start_pos, int* id, int* end_pos) {
assert(start_pos <= line_len); /* caller should have checked */
@@ -231,11 +265,11 @@ def TranslateLexer(lexer_def):
for is_regex, pat, token_id in pat_list:
if is_regex:
re2_pat = TranslateRegex(pat)
re2c_pat = TranslateRegex(pat)
else:
re2_pat = TranslateConstant(pat)
re2c_pat = TranslateConstant(pat)
id_name = meta.IdName(token_id)
print(' %-30s { *id = id__%s; break; }' % (re2_pat, id_name))
print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
# EARLY RETURN: Do NOT advance past the NUL terminator.
print(' %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
@@ -293,8 +327,8 @@ def TranslateRegexToPredicate(py_regex, func_name):
re2c_pat = TranslateRegex(py_regex)
print(r"""
static inline int %s(const char* s, int len) {
unsigned char* p = s; /* modified by re2c */
unsigned char* end = s + len;
const char* p = s; /* modified by re2c */
const char* end = s + len;
/*!re2c
re2c:define:YYCURSOR = p;
@@ -315,7 +349,8 @@ def main(argv):
action = argv[1]
if action == 'c':
# Print code to stdout.
TranslateLexer(lex.LEXER_DEF)
TranslateOshLexer(lex.LEXER_DEF)
TranslateEcholexer(lex.ECHO_E_DEF)
TranslateRegexToPredicate(lex.VAR_NAME_RE, 'IsValidVarName')
TranslateRegexToPredicate(pretty.PLAIN_WORD_RE, 'IsPlainWord')
View
@@ -6,6 +6,8 @@
doesn't depend on any values at runtime.
"""
from core import util
from osh.meta import Id
from osh.meta import runtime
@@ -43,7 +45,10 @@ def EvalCStringToken(id_, value):
if id_ == Id.Char_Literals:
return value
elif id_ == Id.Char_BadBackslash: # TODO: error in strict mode
elif id_ == Id.Char_BadBackslash:
if 1: # TODO: error in strict mode
# Either \A or trailing \ (A is not a valid backslash escape)
util.warn('Invalid backslash escape')
return value
elif id_ == Id.Char_OneChar:
View
@@ -25,13 +25,11 @@ static void debug(const char* fmt, ...) {
#endif
static PyObject *
fastlex_MatchToken(PyObject *self, PyObject *args) {
fastlex_MatchOshToken(PyObject *self, PyObject *args) {
int lex_mode;
unsigned char* line;
unsigned char* line;
int line_len;
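// Py_ssize_t doesn't work here -- presumably because the "s#" format stores
// an int length unless PY_SSIZE_T_CLEAN is defined before including Python.h.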
//Py_ssize_t line_len;
int start_pos;
if (!PyArg_ParseTuple(args, "is#i",
@@ -43,25 +41,39 @@ fastlex_MatchToken(PyObject *self, PyObject *args) {
// Eol_Tok is inserted everywhere.
if (start_pos > line_len) {
PyErr_Format(PyExc_ValueError,
"Invalid MatchToken call (start_pos = %d, line_len = %d)",
"Invalid MatchOshToken call (start_pos = %d, line_len = %d)",
start_pos, line_len);
return NULL;
}
/*
debug("lex_mode %d, line_len %d, start_pos %d\n",
lex_mode, line_len, start_pos);
*/
int id;
int end_pos;
MatchOshToken(lex_mode, line, line_len, start_pos, &id, &end_pos);
return Py_BuildValue("(ii)", id, end_pos);
}
static PyObject *
fastlex_MatchEchoToken(PyObject *self, PyObject *args) {
unsigned char* line;
int line_len;
int start_pos;
if (!PyArg_ParseTuple(args, "s#i", &line, &line_len, &start_pos)) {
return NULL;
}
/*
for (int i = 0; i < line_len; ++i) {
printf("%d c: %c\n", i, line[i]);
// bounds checking. It's OK to be called with a start_pos looking at \0.
// Eol_Tok is inserted everywhere.
if (start_pos > line_len) {
PyErr_Format(PyExc_ValueError,
"Invalid MatchEchoToken call (start_pos = %d, line_len = %d)",
start_pos, line_len);
return NULL;
}
*/
int id;
int end_pos;
MatchToken(lex_mode, line, line_len, start_pos, &id, &end_pos);
MatchEchoToken(line, line_len, start_pos, &id, &end_pos);
return Py_BuildValue("(ii)", id, end_pos);
}
@@ -89,8 +101,10 @@ fastlex_IsPlainWord(PyObject *self, PyObject *args) {
static PyMethodDef methods[] = {
{"MatchToken", fastlex_MatchToken, METH_VARARGS,
{"MatchOshToken", fastlex_MatchOshToken, METH_VARARGS,
"(lexer mode, line, start_pos) -> (id, end_pos)."},
{"MatchEchoToken", fastlex_MatchEchoToken, METH_VARARGS,
"(line, start_pos) -> (id, end_pos)."},
{"IsValidVarName", fastlex_IsValidVarName, METH_VARARGS,
"Is it a valid var name?"},
{"IsPlainWord", fastlex_IsPlainWord, METH_VARARGS,
View
@@ -19,15 +19,16 @@
lex_mode_e = types.lex_mode_e
def MatchToken(lex_mode, line, start_pos):
tok_type, end_pos = fastlex.MatchToken(lex_mode.enum_id, line, start_pos)
# NOTE: This is just like _MatchOshToken_Fast in osh/match.py
def MatchOshToken(lex_mode, line, start_pos):
# Test helper around the native matcher: passes lex_mode.enum_id, line and
# start_pos to fastlex.MatchOshToken(), which returns (token id, end_pos).
# Returns (IdInstance(token id), end_pos).
tok_type, end_pos = fastlex.MatchOshToken(lex_mode.enum_id, line, start_pos)
return IdInstance(tok_type), end_pos
def TokenizeLineOuter(line):
start_pos = 0
while True:
tok_type, end_pos = MatchToken(lex_mode_e.OUTER, line, start_pos)
tok_type, end_pos = MatchOshToken(lex_mode_e.OUTER, line, start_pos)
tok_val = line[start_pos:end_pos]
print('TOK: %s %r\n' % (tok_type, tok_val))
start_pos = end_pos
@@ -38,9 +39,9 @@ def TokenizeLineOuter(line):
class LexTest(unittest.TestCase):
def testMatchToken(self):
def testMatchOshToken(self):
print(dir(fastlex))
print(MatchToken(lex_mode_e.COMMENT, 'line', 3))
print(MatchOshToken(lex_mode_e.COMMENT, 'line', 3))
print()
# Need to be able to pass NUL bytes for EOF.
@@ -50,18 +51,18 @@ def testMatchToken(self):
TokenizeLineOuter(line)
def testOutOfBounds(self):
print(MatchToken(lex_mode_e.OUTER, 'line', 3))
print(MatchOshToken(lex_mode_e.OUTER, 'line', 3))
# It's an error to point to the end of the buffer! Have to be one behind
# it.
return
print(MatchToken(lex_mode_e.OUTER, 'line', 4))
print(MatchToken(lex_mode_e.OUTER, 'line', 5))
print(MatchOshToken(lex_mode_e.OUTER, 'line', 4))
print(MatchOshToken(lex_mode_e.OUTER, 'line', 5))
def testBug(self):
code_str = '-n'
expected = Id.BoolUnary_n
tok_type, end_pos = MatchToken(lex_mode_e.DBRACKET, code_str, 0)
tok_type, end_pos = MatchOshToken(lex_mode_e.DBRACKET, code_str, 0)
print('---', 'expected', expected.enum_value, 'got', tok_type.enum_value)
self.assertEqual(expected, tok_type)
View
@@ -416,10 +416,6 @@ def IsKeyword(name):
C(r'\c', Id.Char_Stop),
# Bad Backslash should not end the string. We allow it, but a lint tool
# should warn about it.
R(r'\\$', Id.Char_BadBackslash),
# e.g. 'foo', anything that's not a backslash escape
R(r'[^\\]+', Id.Char_Literals),
]
View
@@ -56,25 +56,22 @@ def __call__(self, lex_mode, line, start_pos):
def _MatchOshToken_Fast(lex_mode, line, start_pos):
"""Returns (id, end_pos)."""
tok_type, end_pos = fastlex.MatchToken(lex_mode.enum_id, line, start_pos)
tok_type, end_pos = fastlex.MatchOshToken(lex_mode.enum_id, line, start_pos)
# IMPORTANT: We're reusing Id instances here. Ids are very common, so this
# saves memory.
return IdInstance(tok_type), end_pos
# TODO:
#
# MatchToken -> MatchOshToken()
# Add MatchEchoToken()
#
# MATCHER -> OSH_MATCHER
# add ECHO_MATCHER
# TODO: Make this the slow path!
class SimpleLexer(object):
"""Lexer for echo -e, which interprets C-escaped strings.
Based on osh/parse_lib.py MatchToken_Slow.
Based on osh/parse_lib.py MatchOshToken_Slow.
"""
def __init__(self, pat_list):
self.pat_list = _CompileAll(pat_list)

0 comments on commit f7fde14

Please sign in to comment.