Permalink
Browse files

Got re2c code working in a Python extension.

For this step, the re2c expressions are manually written rather than
generated.

Also:

- Assert on out-of-bounds access instead of silently misbehaving
- Correctly match a NUL byte as Eof_Real
  • Loading branch information...
Andy Chu
Andy Chu committed Nov 22, 2017
1 parent 529aaf2 commit 1bef1c5151a77bb3af319ecc35bf4ec957b3220a
Showing with 93 additions and 25 deletions.
  1. +2 −2 build/codegen.sh
  2. +6 −2 build/setup.py
  3. +3 −1 native/fastlex.c
  4. +29 −10 native/fastlex_test.py
  5. +47 −6 osh/lex_gen.py
  6. +6 −4 osh/parse_lib.py
View
@@ -60,14 +60,14 @@ all() {
lex-gen-native
# Why do we need this?
rm -f _devbuild/pylibc/x86_64/lex.so
rm -f _devbuild/pylibc/x86_64/fastlex.so
# Note: This also does pylibc, which we don't want.
build/dev.sh all
}
# Dump the dynamic symbols of the built native extension(s) — a quick sanity
# check that the expected entry points were compiled in and exported.
# NOTE(review): this is a collapsed diff — the lex.so line appears to have
# been replaced by the fastlex.so line; confirm against the repository.
symbols() {
nm _devbuild/pylibc/x86_64/lex.so
nm _devbuild/pylibc/x86_64/fastlex.so
}
# Then the next step is build/dev.sh pylibc?
View
@@ -2,15 +2,19 @@
from distutils.core import setup, Extension
module = Extension('libc',
sources = ['native/libc.c'])
sources = ['native/libc.c'],
undef_macros = ['NDEBUG'])
setup(name = 'libc',
version = '1.0',
description = 'Module for libc functions like fnmatch()',
ext_modules = [module])
# https://stackoverflow.com/questions/4541565/how-can-i-assert-from-python-c-code
module = Extension('fastlex',
sources = ['native/fastlex.c'])
sources = ['native/fastlex.c'],
undef_macros = ['NDEBUG']
)
setup(name = 'fastlex',
version = '1.0',
View
@@ -27,7 +27,7 @@ void debug(const char* fmt, ...) {
static PyObject *
fastlex_MatchToken(PyObject *self, PyObject *args) {
int lex_mode;
const char* line;
unsigned char* line;
int line_len;
// Doesn't work! signed/unsigned confused?
@@ -41,9 +41,11 @@ fastlex_MatchToken(PyObject *self, PyObject *args) {
debug("lex_mode %d, line_len %d, start_pos %d\n",
lex_mode, line_len, start_pos);
/*
for (int i = 0; i < line_len; ++i) {
printf("%d c: %c\n", i, line[i]);
}
*/
int id;
int end_pos;
View
@@ -19,24 +19,43 @@
lex_mode_e = ast.lex_mode_e
def MatchToken(lex_mode, line, s):
  # NOTE(review): collapsed diff — this is the old version; it is shadowed by
  # the redefinition immediately below (params renamed s -> start_pos).
  tok_type, end_index = fastlex.MatchToken(lex_mode.enum_id, line, s)
  return Id(tok_type), end_index


def MatchToken(lex_mode, line, start_pos):
  """Run the native re2c matcher and wrap the raw token type in an Id.

  Args:
    lex_mode: enum value; its .enum_id is passed to the C extension.
    line: string to match against.
    start_pos: index in line where matching begins.

  Returns:
    (Id(tok_type), end_pos): the token id and the index just past the match.
  """
  tok_type, end_pos = fastlex.MatchToken(lex_mode.enum_id, line, start_pos)
  return Id(tok_type), end_pos
def TokenizeLineOuter(line):
  """Tokenize an entire line in OUTER mode, printing each token.

  Repeatedly matches from the current position and advances to the end of
  each match; stops once a match ends exactly at len(line).
  """
  pos = 0
  limit = len(line)
  while True:
    tok_type, end_pos = MatchToken(lex_mode_e.OUTER, line, pos)
    print('TOK: %s %r\n' % (tok_type, line[pos:end_pos]))
    pos = end_pos
    if end_pos == limit:
      break
class LexTest(unittest.TestCase):
  """Exercises the fastlex C extension through the MatchToken wrapper.

  NOTE(review): collapsed diff — old and new statements of each test body
  both appear below (Python 2 print statements).
  """

  def testMatchToken(self):
    # Show what the extension module exports.
    print(dir(fastlex))
    print lex_mode_e.COMMENT.enum_id
    # Old version of the call (kept by the collapsed diff):
    result = MatchToken(lex_mode_e.COMMENT, 'line', 3)
    print result
    # New version of the same call:
    print MatchToken(lex_mode_e.COMMENT, 'line', 3)
    print
    # Need to be able to pass NUL bytes for EOF.
    result = MatchToken(lex_mode_e.OUTER, 'end of file\0', 3)
    # TODO: Need to turn Id back?
    print result
    # Tokenize a newline-terminated and a NUL-terminated line end to end.
    line = 'end of line\n'
    TokenizeLineOuter(line)
    line = 'end of file\0'
    TokenizeLineOuter(line)

  def testOutOfBounds(self):
    # In-bounds position: fine.
    print MatchToken(lex_mode_e.OUTER, 'line', 3)
    # It's an error to point to the end of the buffer! Have to be one behind
    # it.
    return
    # NOTE(review): the calls below are deliberately unreachable — start_pos
    # at or past len(line) trips the assert in the C extension.
    print MatchToken(lex_mode_e.OUTER, 'line', 4)
    print MatchToken(lex_mode_e.OUTER, 'line', 5)
if __name__ == '__main__':
View
@@ -63,22 +63,63 @@ def main(argv):
# This becomes osh-lex.re2c.c. It is compiled to osh-lex.c and then
# included.
print """
print r"""
/* Common stuff */
/*!re2c
re2c:define:YYCTYPE = "unsigned char";
re2c:yyfill:enable = 0;
re2c:define:YYCURSOR = p;
re2c:define:YYLIMIT = q;
*/
inline void MatchToken(int lex_mode, unsigned char* line, int line_len,
int start_pos, int* id, int* end_pos) {
unsigned char* p = line + start_pos; /* modified by re2c */
unsigned char* q = line + line_len; /* yylimit */
// bounds checking
assert(p < q);
//printf("p: %p q: %p\n", p, q);
inline void MatchToken(int lex_mode, char* line, int line_len, int start_pos,
int* id, int* end_pos) {
switch (lex_mode) {
case lex_mode__OUTER:
*id = id__Lit_Chars;
for (;;) {
/*!re2c
literal_chunk = [a-zA-Z0-9_/.-]+;
var_like = [a-zA-Z_][a-zA-Z0-9_]* "="; // might be NAME=val
comment = [ \t\r]* "#" [^\000\r\n]*;
space = [ \t\r]+;
nul = "\000";
literal_chunk { *id = id__Lit_Chars; break; }
var_like { *id = id__Lit_VarLike; break; }
[ \t\r]* "\n" { *id = id__Op_Newline; break; }
space { *id = id__WS_Space; break; }
nul { *id = id__Eof_Real; break; }
// anything else
* { *id = id__Lit_Other; break; }
*/
}
//*id = id__Lit_Other;
*end_pos = 3;
*end_pos = p - line; /* relative */
break;
case lex_mode__COMMENT:
*id = id__Lit_Other;
*end_pos = 5;
*end_pos = 6;
break;
default:
assert(0);
}
}
"""
View
@@ -48,12 +48,14 @@ def MatchToken_Fast(lex_mode, line, start_pos):
def _MakeMatcher():
  """Select the token-matching implementation (slow Python vs. native).

  NOTE(review): collapsed diff — the unconditional return below is the old
  version and makes the if/else unreachable; the new version keeps only the
  if/else, preferring the fastlex extension when it imported successfully.
  """
  # NOTE: Could have an environment variable to control this for speed?
  return MatchToken_Slow(lex.LEXER_DEF)
  #if fastlex:
  # return MatchToken_Fast
  #else:
  # return MatchToken_Slow(lex.LEXER_DEF)
  if fastlex:
    return MatchToken_Fast
  else:
    return MatchToken_Slow(lex.LEXER_DEF)
def InitLexer(s, arena=None):

0 comments on commit 1bef1c5

Please sign in to comment.