
Lexer optimization: Return Eol_Tok at the end of the line.

We don't want to check AtEnd() on every lexer.Read() call.

We rely on the fact that there are no NUL bytes in program text.  When
we see one, we know we're at the end of the line and can return
Eol_Tok.  (A sketch of the idea follows the list below.)

Also:

- Removed an unused function in the generated Python code.
- Added shell functions to profile the biggest shell program.
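
A minimal Python sketch of the idea, with hypothetical names (the real match
function is generated C, where the check is free because re2c matches the NUL
byte like any other character):

  EOL_TOK = 'Eol_Tok'  # sentinel kind: no more tokens on this line

  def match_token(line, start_pos):
    # Stand-in for the generated matcher.  At the end of the line it returns
    # the sentinel instead of raising, and does not advance past the end.
    if start_pos == len(line):  # models seeing the \0 byte in C
      return EOL_TOK, start_pos
    return 'Lit_Chars', start_pos + 1  # fake match: one char per token

  def tokenize(line):
    pos = 0
    while True:  # no AtEnd() check per token ...
      tok_type, end_pos = match_token(line, pos)
      if tok_type == EOL_TOK:  # ... just one test of the result
        break
      yield tok_type, line[pos:end_pos]
      pos = end_pos

  print(list(tokenize('ab')))  # [('Lit_Chars', 'a'), ('Lit_Chars', 'b')]
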
Andy Chu committed Dec 20, 2017
1 parent 18a17ed commit 34f0658fd9ef71eeaa6468c3c83c45c8f85f267f
Showing with 36 additions and 48 deletions.
  1. +0 −12 asdl/gen_python.py
  2. +6 −3 benchmarks/pytrace.sh
  3. +1 −1 core/id_kind.py
  4. +8 −11 core/lexer.py
  5. +10 −11 core/lexer_gen.py
  6. +10 −0 native/fastlex.c
  7. +1 −1 native/fastlex_test.py
  8. +0 −9 osh/lex_test.py
asdl/gen_python.py
@@ -15,14 +15,6 @@
 class GenClassesVisitor(gen_cpp.AsdlVisitor):
   # TODO:
   # - DESCRIPTOR and FIELDS are dummies right now.
-  # - I think FIELDS is used for encoding.
-  #
-  # - Debug mode:
-  #   - _CheckType(value, desc) on initialization and __setattr__.
-  #   - check unassigned.  Why is it done with unit tests with CheckUnassigned,
-  #     but also in _Init?

   def VisitSimpleSum(self, sum, name, depth):
     self.Emit('class %s_e(py_meta.SimpleObj):' % name, depth)
@@ -88,10 +80,6 @@ def _GenClass(self, desc, name, super_name, depth, tag_num=None,
-    self.Emit('', depth)
-    self.Emit('  def CheckUnassigned(self):', depth)
-    self.Emit('    pass', depth)
-    self.Emit('', depth)

   def VisitConstructor(self, cons, def_name, tag_num, depth):
     if cons.fields:
       self._GenClass(cons, cons.name, def_name, depth, tag_num=tag_num)
benchmarks/pytrace.sh
@@ -11,7 +11,8 @@ set -o nounset
 set -o pipefail
 set -o errexit

-readonly ABUILD=~/git/alpine/abuild/abuild
+readonly BIGGEST=benchmarks/testdata/configure-coreutils
+readonly ABUILD=benchmarks/testdata/abuild

 readonly -a RUN_ABUILD=(bin/oil.py osh $ABUILD -h)
 readonly -a OSH_PARSE=(bin/oil.py osh --ast-format none -n)
@@ -24,6 +25,8 @@ readonly -a OSH_PARSE=(bin/oil.py osh --ast-format none -n)
 time-run-abuild() { time "${RUN_ABUILD[@]}"; }
 time-parse-abuild() { time "${OSH_PARSE[@]}" $ABUILD; }
+time-parse-biggest() { time "${OSH_PARSE[@]}" $BIGGEST; }

 _cprofile() {
   local out=$1
   shift
@@ -41,8 +44,8 @@ cprofile-osh-parse() {
 cprofile-parse-abuild() {
   cprofile-osh-parse $ABUILD _tmp/abuild.cprofile
 }

-cprofile-parse-configure() {
-  cprofile-osh-parse benchmarks/testdata/configure _tmp/configure.cprofile
+cprofile-parse-biggest() {
+  cprofile-osh-parse $BIGGEST _tmp/biggest.cprofile
 }

 cprofile-run-abuild() {
   _cprofile _tmp/abuild-run.cprofile "${RUN_ABUILD[@]}"
core/id_kind.py
@@ -176,7 +176,7 @@ def _AddKinds(spec):
   # TODO: Unknown_Tok is OK, but Undefined_Id is better
   spec.AddKind('Undefined', ['Tok'])  # for initial state
   spec.AddKind('Unknown',   ['Tok'])  # for when nothing matches

-  spec.AddKind('Eol',       ['Tok'])  # no more tokens on line
+  spec.AddKind('Eol',       ['Tok'])  # no more tokens on line (\0)

   spec.AddKind('Eof', ['Real', 'RParen', 'Backtick'])
core/lexer.py
@@ -45,9 +45,10 @@ def __init__(self, match_func, line, arena=None):
     self.arena_skip = False  # For MaybeUnreadOne
     self.last_span_id = const.NO_INTEGER  # For MaybeUnreadOne
-    self.Reset(line, -1)  # Invalid arena index to start
+    self.Reset(line, -1)  # Invalid line_id to start

   def Reset(self, line, line_id):
+    #assert line, repr(line)  # can't be empty or None
     self.line = line
     self.line_pos = 0
     self.line_id = line_id
@@ -101,14 +102,11 @@ def LookAhead(self, lex_mode):
     return ast.token(tok_type, tok_val, const.NO_INTEGER)

-  def AtEnd(self):
-    return self.line_pos == len(self.line)
-
   def Read(self, lex_mode):
-    if self.AtEnd():
-      raise AssertionError('EOF')
+    #assert self.line_pos <= len(self.line), (self.line, self.line_pos)
     tok_type, end_pos = self.match_func(lex_mode, self.line, self.line_pos)
+    #assert end_pos <= len(self.line)
     tok_val = self.line[self.line_pos:end_pos]

     # NOTE: tok_val is redundant, but even in osh.asdl we have some separation
@@ -199,7 +197,8 @@ def PushHint(self, old_id, new_id):
     self.translation_stack.append((old_id, new_id))

   def _Read(self, lex_mode):
-    if self.line_lexer.AtEnd():
+    t = self.line_lexer.Read(lex_mode)
+    if t.id == Id.Eol_Tok:  # hit \0
       line_id, line = self.line_reader.GetLine()

       if line is None:  # no more lines
@@ -208,16 +207,14 @@ def _Read(self, lex_mode):
         return t

       self.line_lexer.Reset(line, line_id)
+      t = self.line_lexer.Read(lex_mode)

-    t = self.line_lexer.Read(lex_mode)
-
     # e.g. translate ) or ` into EOF
     if self.translation_stack:
       old_id, new_id = self.translation_stack[-1]  # top
       if t.id == old_id:
         #print('==> TRANSLATING %s ==> %s' % (t, new_s))
         self.translation_stack.pop()
         #print(self.translation_stack)
         t.id = new_id

     return t
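
Pieced together from the two core/lexer.py hunks above, the new _Read() reads
once and only fetches a new line when the sentinel comes back.  A simplified
sketch with stub classes so it runs standalone (the real code also applies the
translation stack shown above, and does more in the no-more-lines case):

  class Tok(object):
    def __init__(self, id_, val):
      self.id = id_
      self.val = val

  class StubLineLexer(object):
    """Yields one 'Char' token per byte, then the Eol_Tok sentinel."""
    def Reset(self, line, line_id):
      self.line = line
      self.line_pos = 0
    def Read(self, lex_mode):
      if self.line_pos == len(self.line):  # models hitting \0
        return Tok('Eol_Tok', '')
      c = self.line[self.line_pos]
      self.line_pos += 1
      return Tok('Char', c)

  class StubLineReader(object):
    def __init__(self, lines):
      self.lines = [(i, s) for i, s in enumerate(lines)]
    def GetLine(self):
      return self.lines.pop(0) if self.lines else (-1, None)

  def _Read(line_lexer, line_reader, lex_mode=None):
    t = line_lexer.Read(lex_mode)
    if t.id == 'Eol_Tok':  # this line is exhausted
      line_id, line = line_reader.GetLine()
      if line is None:  # no more lines
        return t  # (elided: the real code does more here)
      line_lexer.Reset(line, line_id)
      t = line_lexer.Read(lex_mode)  # first token of the fresh line
    return t

  ll = StubLineLexer()
  ll.Reset('a\n', 0)
  lr = StubLineReader(['b\n'])
  while True:
    t = _Read(ll, lr)
    if t.id == 'Eol_Tok':
      break
    print(t.id, repr(t.val))  # Char 'a', Char '\n', Char 'b', Char '\n'

Because every line ends with '\n', the first Read() on a fresh line always
yields a real token, so a single 'if' (not a loop) suffices.
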
core/lexer_gen.py
@@ -197,13 +197,7 @@ def TranslateLexer(lexer_def):
 static inline void MatchToken(int lex_mode, unsigned char* line, int line_len,
                               int start_pos, int* id, int* end_pos) {
-  // bounds checking
-  if (start_pos >= line_len) {
-    fprintf(stderr, "start_pos %d line_len %d\n", start_pos, line_len);
-    assert(0);
-  }
-  //assert(start_pos < line_len);
+  assert(start_pos <= line_len);  /* caller should have checked */

   unsigned char* p = line + start_pos;  /* modified by re2c */

   //printf("p: %p q: %p\n", p, q);
@@ -212,7 +206,8 @@ def TranslateLexer(lexer_def):
   switch (lex_mode)  {
 """

-  # TODO: Should be ordered by most common?
+  # TODO: Should be ordered by most common?  Or will profile-directed feedback
+  # help?
   for state, pat_list in lexer_def.iteritems():
     # HACK: strip off '_e'
@@ -261,8 +256,7 @@ def TranslateLexer(lexer_def):
       */
   }
   //*id = id__Lit_Other;
-  *end_pos = p - line;  /* relative */
+  *end_pos = p - line;
   break;

 case lex_mode__COMMENT:
@@ -276,7 +270,12 @@ def TranslateLexer(lexer_def):
     assert(0);
   }

-  *end_pos = p - line;  /* relative */
+  if (*id == id__Eol_Tok) {
+    /* don't move past if Eol_Tok */
+    *end_pos = start_pos;
+  } else {
+    *end_pos = p - line;  /* relative */
+  }
 }
 """
native/fastlex.c
@@ -38,6 +38,16 @@ fastlex_MatchToken(PyObject *self, PyObject *args) {
                         &lex_mode, &line, &line_len, &start_pos)) {
     return NULL;
   }

+  // bounds checking.  It's OK to be called with a start_pos looking at \0.
+  // Eol_Tok is inserted everywhere.
+  if (start_pos > line_len) {
+    PyErr_Format(PyExc_ValueError,
+                 "Invalid MatchToken call (start_pos = %d, line_len =%d)",
+                 start_pos, line_len);
+    return NULL;
+  }
+
   /*
   debug("lex_mode %d, line_len %d, start_pos %d\n",
         lex_mode, line_len, start_pos);
native/fastlex_test.py
@@ -33,7 +33,7 @@ def TokenizeLineOuter(line):
     print('TOK: %s %r\n' % (tok_type, tok_val))
     start_pos = end_pos

-    if end_pos == len(line):
+    if tok_type == Id.Eol_Tok:
       break
osh/lex_test.py
@@ -173,15 +173,6 @@ def assertTokensEqual(self, left, right):
     self.assertTrue(TokensEqual(left, right))

   def testReadOuter(self):
     # Lines always end with '\n'
-    l = LineLexer(parse_lib._MakeMatcher(), '')
-    try:
-      l.Read(lex_mode_e.OUTER)
-    except AssertionError as e:
-      print(e)
-    else:
-      raise AssertionError('Expected error')
-
     l = LineLexer(parse_lib._MakeMatcher(), '\n')
     self.assertTokensEqual(
         ast.token(Id.Op_Newline, '\n'), l.Read(lex_mode_e.OUTER))
