Permalink
Browse files

Successfully translated the lexer to re2c and compiled the output.

A basic 'echo hi' program works, but there is a bug parsing 'configure'
which I still have to look into.

Add size profiling of the resulting code.  21 KiB seems reasonable for
now.
  • Loading branch information...
Andy Chu
Andy Chu committed Nov 22, 2017
1 parent 1bef1c5 commit 107d0776069c4fa5dd466412f79a3a717aa2036f
Showing with 397 additions and 155 deletions.
  1. +31 −6 build/codegen.sh
  2. +316 −0 core/lexer_gen.py
  3. +48 −0 core/lexer_gen_test.py
  4. +2 −0 native/fastlex.c
  5. +0 −148 osh/lex_gen.py
  6. +0 −1 osh/parse_lib.py
View
@@ -42,22 +42,29 @@ id-gen() {
PYTHONPATH=. core/id_kind_gen.py c | tee _build/gen/id.h
}
# Run the Python lexer generator with the current directory on PYTHONPATH.
# All arguments are forwarded unchanged; the first one is the action that
# core/lexer_gen.py dispatches on.
lexer-gen() {
PYTHONPATH=. core/lexer_gen.py "$@"
}
# _gen/osh_lex.re2c.c
# This includes osh_ast.h
lex-gen() {
PYTHONPATH=. osh/lex_gen.py "$@" | tee _build/gen/osh-lex.re2c.h
osh-lex-gen() {
lexer-gen c | tee _build/gen/osh-lex.re2c.h
}
# Debugging shortcuts: each one invokes lexer-gen with the action of the same
# name (print-regex / print-all).
print-regex() { lexer-gen print-regex; }
print-all() { lexer-gen print-all; }
# re2c native.
lex-gen-native() {
osh-lex-gen-native() {
re2c -o _build/gen/osh-lex.h _build/gen/osh-lex.re2c.h
}
all() {
ast-gen
id-gen
lex-gen
lex-gen-native
osh-lex-gen
osh-lex-gen-native
# Why do we need this?
rm -f _devbuild/pylibc/x86_64/fastlex.so
@@ -66,8 +73,26 @@ all() {
build/dev.sh all
}
# Size profiler for binaries. TODO: Fold this into benchmarks/
# NOTE: Runs the bloaty executable at the hard-coded path
# ~/git/other/bloaty/bloaty, passing all arguments through.
bloaty() {
~/git/other/bloaty/bloaty "$@"
}
symbols() {
nm _devbuild/pylibc/x86_64/fastlex.so
local obj=_devbuild/pylibc/x86_64/fastlex.so
nm $obj
echo
bloaty $obj
echo
# fastlex_MatchToken is 21.2 KiB. That doesn't seem too large compared to
# the 14K line output?
bloaty -d symbols $obj
echo
ls -l $obj
echo
}
# Then the next step is build/dev.sh pylibc?
View
@@ -0,0 +1,316 @@
#!/usr/bin/python
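"""
lexer_gen.py - Translate the osh lexer definition (osh.lex.LEXER_DEF) into
re2c source, or print its patterns for debugging.
"""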
import cStringIO
import sys
import sre_parse
from osh import lex
def PrintTree(re_tree, depth=2):
  """Print a parsed regex tree to stdout as a tab-indented outline.

  This is a debugging aid; it does not produce re2c syntax.

  Args:
    re_tree: Iterable of (name, arg) nodes.  The node names handled here are
      the strings 'in', 'max_repeat', 'negate', 'literal', 'not_literal',
      'range' and 'any'.
    depth: Number of tabs to indent this level with; nested nodes are printed
      at depth + 1.

  Raises:
    AssertionError: If a node has a name other than the ones listed above.
  """
  indent = depth * '\t'
  write = sys.stdout.write

  for name, arg in re_tree:
    write(indent)
    write(name)
    write(' ')

    if name == 'in':  # character class
      write('{\n')
      PrintTree(arg, depth=depth+1)
      write(indent)
      write('}\n')

    elif name == 'max_repeat':  # repetition
      # min = 0 means *, min = 1 means +.  Other bounds are printed as-is;
      # this function only displays the tree, so it doesn't need to reject
      # them.
      min_, max_, children = arg
      write('%s %s {\n' % (min_, max_))
      PrintTree(children, depth=depth+1)
      write(indent)
      # Close the brace opened above (this used to print an empty line,
      # leaving the block unbalanced).
      write('}\n')

    elif name == 'negate':  # Oh this is a ^.  It doesn't form a node.
      assert arg is None
      write('\n')

    elif name == 'literal':  # Quote \ and " in re2c syntax
      write(repr(chr(arg)) + '\n')

    elif name == 'not_literal':  # ditto
      write(repr(chr(arg)) + '\n')

    elif name == 'range':  # ascii range
      begin, end = arg
      write('%s %s\n' % (repr(chr(begin)), repr(chr(end))))

    elif name == 'any':  # This is the '.' character
      assert arg is None
      write('\n')

    else:
      raise AssertionError(name)

  # NOTE: negate and not_literal are sort of duplicated
def PrintRegex(pat):
  """Parse the regex `pat` and dump its tree to stdout inside [ ] markers.

  The pattern is parsed with sre_parse.parse() and the resulting tree is
  printed by PrintTree() at its default depth.
  """
  sys.stdout.write('\t\t[' + '\n')
  PrintTree(sre_parse.parse(pat))
  sys.stdout.write('\t\t]' + '\n')
# ^ means negation, - means range
CHAR_CLASS_META = ['\\', '^', '-', ']']
CHAR_CLASS_META_CODES = [ord(c) for c in CHAR_CLASS_META]

# re2c literals are inside double quotes, so we don't need to do anything with
# ^ or whatever.
LITERAL_META = ['\\', '"']
LITERAL_META_CODES = [ord(c) for c in LITERAL_META]

# Byte values that get the same fixed escape in both contexts.
_COMMON_ESCAPES = {
    0: r'\x00',
    ord('\n'): r'\n',
    ord('\r'): r'\r',
    ord('\t'): r'\t',
}


def _EscapeByte(arg, meta_codes):
  """Return the re2c source text for the byte value `arg`.

  Args:
    arg: Integer character code.
    meta_codes: Codes that must be backslash-escaped in the target context
      (character class or double-quoted literal).

  Returns:
    A string: a fixed escape for NUL / newline / CR / tab, a backslash-escaped
    metacharacter, a \\xHH escape for any other control or non-ASCII byte, or
    the character itself.
  """
  fixed = _COMMON_ESCAPES.get(arg)
  if fixed is not None:
    return fixed
  if arg in meta_codes:
    return '\\' + chr(arg)
  # Don't leak raw control bytes (form feed, ESC, DEL, ...) or high bytes
  # into the generated source; spell them as hex escapes instead.
  if 0 <= arg < 0x20 or 0x7f <= arg <= 0xff:
    return '\\x%02x' % arg
  return chr(arg)


def _CharClassLiteral(arg):
  """Return byte value `arg` as it should appear inside a re2c [...] class."""
  return _EscapeByte(arg, CHAR_CLASS_META_CODES)


def _Literal(arg):
  """Return byte value `arg` as it should appear inside a re2c "..." string."""
  return _EscapeByte(arg, LITERAL_META_CODES)


def TranslateConstant(pat):
  """Translate the constant string `pat` into a double-quoted re2c literal."""
  return '"' + ''.join(_Literal(ord(c)) for c in pat) + '"'
def _RepeatSuffix(min_, max_):
  """Return the re2c postfix operator for a repetition of min_..max_ times."""
  if (min_, max_) == (0, 1):
    return '?'
  if max_ == sre_parse.MAXREPEAT:  # no upper bound
    if min_ == 0:
      return '*'
    if min_ == 1:
      return '+'
    return '{%d,}' % min_
  if min_ == max_:
    return '{%d}' % min_
  return '{%d,%d}' % (min_, max_)


def TranslateTree(re_tree, f, in_char_class=False):
  """Write the re2c equivalent of a parsed regex tree to `f`.

  Args:
    re_tree: Iterable of (name, arg) nodes.  The node names handled here are
      the strings 'in', 'max_repeat', 'negate', 'literal', 'not_literal',
      'range' and 'any'.
    f: Object with a write() method that receives the re2c text.
    in_char_class: True while translating the members of a [...] class, where
      literals are written bare instead of as quoted strings.

  Raises:
    AssertionError: If a node has a name other than the ones listed above, or
      a 'not_literal' node shows up inside a character class.
  """
  for name, arg in re_tree:
    if name == 'in':  # character class
      f.write('[')
      TranslateTree(arg, f, in_char_class=True)  # list of literals/ranges
      f.write(']')

    elif name == 'max_repeat':  # repetition
      min_, max_, children = arg
      TranslateTree(children, f)
      # ?, * and + as before; explicit bounds such as {1,3} become the re2c
      # {n,m} form rather than being silently widened to + or *.
      f.write(_RepeatSuffix(min_, max_) + ' ')

    elif name == 'negate':  # ^ in [^a-z]
      assert arg is None
      f.write('^')

    elif name == 'literal':  # Quote \ and " in re2c syntax
      # The escaping rules differ inside and outside a character class.
      if in_char_class:
        f.write(_CharClassLiteral(arg))
      else:
        f.write('"%s" ' % _Literal(arg))

    elif name == 'not_literal':  # ditto
      assert not in_char_class
      f.write('[^%s]' % _CharClassLiteral(arg))

    elif name == 'range':  # ascii range
      begin, end = arg
      # Endpoints need the same escaping as any other class member, e.g. a
      # range that starts at NUL or ends at ']' or '\'.
      f.write('%s-%s' % (_CharClassLiteral(begin), _CharClassLiteral(end)))

    elif name == 'any':  # This is the '.' character
      assert arg is None
      f.write('.')

    else:
      raise AssertionError(name)

  # NOTE: negate and not_literal are sort of duplicated
def TranslateRegex(pat):
  """Return the re2c translation of the Python regex `pat` as a string.

  The pattern is parsed with sre_parse.parse() and the tree is rendered by
  TranslateTree() into an in-memory buffer.
  """
  buf = cStringIO.StringIO()
  TranslateTree(sre_parse.parse(pat), buf)
  return buf.getvalue()
def TranslateLexer(lexer_def):
  """Print re2c/C source for a MatchToken() function to stdout.

  The output is a fixed prologue, then one `case` of a switch on lex_mode per
  entry of lexer_def, then a fixed epilogue.

  Args:
    lexer_def: Mapping (with iteritems()) from a lexer state to a list of
      (is_regex, pat, token_id) tuples.  Regex patterns go through
      TranslateRegex(), constant ones through TranslateConstant().  Each state
      must have a `name` attribute, and its class name is expected to end in a
      two-character suffix that is stripped to form the case label prefix.
  """
  def emit(text=''):
    sys.stdout.write(text + '\n')

  emit(r"""
/* Common stuff */
/*!re2c
re2c:define:YYCTYPE = "unsigned char";
re2c:yyfill:enable = 0;
re2c:define:YYCURSOR = p;
re2c:define:YYLIMIT = q;
*/
inline void MatchToken(int lex_mode, unsigned char* line, int line_len,
int start_pos, int* id, int* end_pos) {
unsigned char* p = line + start_pos; /* modified by re2c */
unsigned char* q = line + line_len; /* yylimit */
// bounds checking
assert(p < q);
//printf("p: %p q: %p\n", p, q);
unsigned char* YYMARKER; /* why do we need this? */
switch (lex_mode) {
""")

  # TODO: Should be ordered by most common?
  for mode, rules in lexer_def.iteritems():
    # HACK: strip off '_e'
    enum_prefix = state_class_name = mode.__class__.__name__
    enum_prefix = state_class_name[:-2]
    emit(' case %s__%s:' % (enum_prefix, mode.name))
    emit(' for (;;) {')
    emit(' /*!re2c')

    for is_regex, pat, token_id in rules:
      translate = TranslateRegex if is_regex else TranslateConstant
      emit(' %-30s { *id = id__%s; break; }' % (translate(pat), token_id))

    emit(' */')
    emit(' }')
    emit(' break;')
    emit()

  # This is literal code without generation.  The string below is never
  # printed; it is kept only as a hand-written reference.
  """
case lex_mode__OUTER:
for (;;) {
/*!re2c
literal_chunk = [a-zA-Z0-9_/.-]+;
var_like = [a-zA-Z_][a-zA-Z0-9_]* "="; // might be NAME=val
comment = [ \t\r]* "#" [^\000\r\n]*;
space = [ \t\r]+;
nul = "\000";
literal_chunk { *id = id__Lit_Chars; break; }
var_like { *id = id__Lit_VarLike; break; }
[ \t\r]* "\n" { *id = id__Op_Newline; break; }
space { *id = id__WS_Space; break; }
nul { *id = id__Eof_Real; break; }
// anything else
* { *id = id__Lit_Other; break; }
*/
}
//*id = id__Lit_Other;
*end_pos = p - line; /* relative */
break;
case lex_mode__COMMENT:
*id = id__Lit_Other;
*end_pos = 6;
break;
"""

  emit("""\
default:
assert(0);
}
*end_pos = p - line; /* relative */
}
""")
# note: use YYCURSOR and YYLIMIT
# limit should be the end of string
# line + line_len
def main(argv):
  """Run the action named by argv[1].

  Actions:
    c: Print the generated re2c source for lex.LEXER_DEF.  build/codegen.sh
      saves this as _build/gen/osh-lex.re2c.h and then runs re2c on it to
      produce _build/gen/osh-lex.h.
    print-all: Print every state and its (pattern -> token id) pairs, and
      translate each regex as a check that it is supported.
    print-regex: Print each regex next to its re2c translation, each constant
      as a re2c literal, and a final count of regexes.

  Args:
    argv: Argument list in sys.argv form (program name first).

  Raises:
    RuntimeError: If no action is given or the action is not one of the above.
  """
  out = sys.stdout.write

  if len(argv) < 2:
    raise RuntimeError('Expected an action: c, print-all, or print-regex')
  action = argv[1]

  if action == 'c':
    TranslateLexer(lex.LEXER_DEF)

  elif action == 'print-all':
    # Top level is a switch statement.
    for state, pat_list in lex.LEXER_DEF.iteritems():
      out(str(state) + '\n')
      # This level is re2c patterns.
      for is_regex, pat, token_id in pat_list:
        out('\t%r -> %r' % (pat, token_id) + '\n')
        if is_regex:
          # The result isn't printed; translating is only a check that the
          # pattern uses constructs TranslateTree() understands.
          TranslateRegex(pat)
      out('\n')

  elif action == 'print-regex':
    unique = set()
    num_regexes = 0

    for state, pat_list in lex.LEXER_DEF.iteritems():
      out(str(state) + '\n')
      # This level is re2c patterns.
      for is_regex, pat, token_id in pat_list:
        if is_regex:
          out('\t' + pat + '\n')
          out('\t' + TranslateRegex(pat) + '\n')
          out('\n')
          num_regexes += 1
          unique.add(pat)
        else:
          out('\t' + TranslateConstant(pat) + '\n')
      out('\n')

    out('Printed %d regexes (%d unique)' % (num_regexes, len(unique)) + '\n')

  else:
    # Don't exit successfully without having done anything.
    raise RuntimeError(
        'Invalid action %r (expected c, print-all, or print-regex)' % action)
# Script entry point: a RuntimeError from main() is reported on stderr as a
# one-line FATAL message and turned into exit status 1.
if __name__ == '__main__':
  try:
    main(sys.argv)
  except RuntimeError as err:
    sys.stderr.write('FATAL: %s' % err + '\n')
    sys.exit(1)
Oops, something went wrong.

0 comments on commit 107d077

Please sign in to comment.