Permalink
Browse files

Introduce a match_func abstraction for the LineLexer to use.

It can be implemented by either fastlex.c or slow loop in Python.

Unit tests all pass.

Also rename *_index to *_pos.
  • Loading branch information...
Andy Chu
Andy Chu committed Nov 22, 2017
1 parent ccb0a23 commit 529aaf242a5e72efaa6b8d9c4c1cc6cb8c5829cf
Showing with 78 additions and 88 deletions.
  1. +8 −48 core/lexer.py
  2. +8 −8 native/fastlex.c
  3. +4 −4 osh/lex_gen.py
  4. +9 −24 osh/lex_test.py
  5. +49 −4 osh/parse_lib.py
View
@@ -36,52 +36,15 @@ def CompileAll(pat_list):
return result
def FindLongestMatch(re_list, s, pos):
"""Finds the FIRST match.
NOTE: max() appears to find the FIRST max, which we rely on.
"""
matches = []
for regex, tok_type in re_list:
m = regex.match(s, pos) # left-anchored
if m:
matches.append((m.end(0), tok_type, m.group(0)))
if not matches:
raise AssertionError('no match at position %d: %r' % (pos, s))
end_index, tok_type, tok_val = max(matches, key=lambda m: m[0])
return end_index, tok_type, tok_val
# TODO: LineLexer needs the matcher rather than lexer_def.
class MatchTokenSlow(object):
"""An abstract matcher that doesn't depend on OSH."""
def __init__(self, lexer_def):
self.lexer_def = lexer_def
def __call__(self, lex_mode, line, start_index):
"""Returns (id, end_index)."""
return FindLongestMatch(self.lexer_def[lex_mode], line, start_index)
def MatchTokenFast(lex_mode, line, start_index):
"""Returns (id, end_index)."""
tok_type, end_index = lex.MatchToken(lex_mode.enum_id, line, start_index)
return Id(tok_type), end_index
class LineLexer(object):
def __init__(self, lexer_def, line, arena=None):
def __init__(self, match_func, line, arena=None):
# Compile all regexes
self.lexer_def = {}
self.match_func = match_func
self.arena = arena
self.arena_skip = False # For MaybeUnreadOne
self.last_span_id = -1 # For MaybeUnreadOne
for state, pat_list in lexer_def.items():
self.lexer_def[state] = CompileAll(pat_list)
self.Reset(line, -1) # Invalid arena index to start
def Reset(self, line, line_id):
@@ -128,14 +91,13 @@ def LookAhead(self, lex_mode):
t = ast.token(Id.Unknown_Tok, '', -1) # no span ID
return t
re_list = self.lexer_def[lex_mode]
end_index, tok_type, tok_val = FindLongestMatch(
re_list, self.line, pos)
tok_type, end_pos = self.match_func(lex_mode, self.line, pos)
tok_val = self.line[pos:end_pos]
# NOTE: Instead of hard-coding this token, we could pass it in. This
# one only appears in OUTER state! LookAhead(lex_mode, past_token_type)
if tok_type != Id.WS_Space:
break
pos = end_index
pos = end_pos
return ast.token(tok_type, tok_val) # no location
@@ -146,10 +108,8 @@ def Read(self, lex_mode):
if self.AtEnd():
raise AssertionError('EOF')
re_list = self.lexer_def[lex_mode]
end_index, tok_type, tok_val = FindLongestMatch(
re_list, self.line, self.line_pos)
tok_type, end_pos = self.match_func(lex_mode, self.line, self.line_pos)
tok_val = self.line[self.line_pos:end_pos]
# NOTE: tok_val is redundant, but even in osh.asdl we have some separation
# between data needed for formatting and data needed for execution. Could
@@ -178,7 +138,7 @@ def Read(self, lex_mode):
t = ast.token(tok_type, tok_val, span_id)
self.line_pos = end_index
self.line_pos = end_pos
return t
View
@@ -33,22 +33,22 @@ fastlex_MatchToken(PyObject *self, PyObject *args) {
// Doesn't work! signed/unsigned confused?
//Py_ssize_t line_len;
int start_index;
int start_pos;
if (!PyArg_ParseTuple(args, "is#i",
&lex_mode, &line, &line_len, &start_index)) {
&lex_mode, &line, &line_len, &start_pos)) {
return NULL;
}
debug("lex_mode %d, line_len %d, start_index %d\n",
lex_mode, line_len, start_index);
debug("lex_mode %d, line_len %d, start_pos %d\n",
lex_mode, line_len, start_pos);
for (int i = 0; i < line_len; ++i) {
printf("%d c: %c\n", i, line[i]);
}
int id;
int end_index;
MatchToken(lex_mode, line, line_len, start_index, &id, &end_index);
return Py_BuildValue("(ii)", id, end_index);
int end_pos;
MatchToken(lex_mode, line, line_len, start_pos, &id, &end_pos);
return Py_BuildValue("(ii)", id, end_pos);
}
// Rename to TokenMatcher?
@@ -58,7 +58,7 @@ fastlex_MatchToken(PyObject *self, PyObject *args) {
PyMethodDef methods[] = {
{"MatchToken", fastlex_MatchToken, METH_VARARGS,
"(lexer mode, line, start_index) -> (id, end_index)."},
"(lexer mode, line, start_pos) -> (id, end_pos)."},
{NULL, NULL},
};
View
@@ -65,17 +65,17 @@ def main(argv):
print """
inline void MatchToken(int lex_mode, char* line, int line_len, int start_index,
int* id, int* end_index) {
inline void MatchToken(int lex_mode, char* line, int line_len, int start_pos,
int* id, int* end_pos) {
switch (lex_mode) {
case lex_mode__OUTER:
*id = id__Lit_Chars;
//*id = id__Lit_Other;
*end_index = 3;
*end_pos = 3;
break;
case lex_mode__COMMENT:
*id = id__Lit_Other;
*end_index = 5;
*end_pos = 5;
break;
default:
assert(0);
View
@@ -7,14 +7,13 @@
from core import alloc
from core.id_kind import Id, Kind, LookupKind
from core.lexer import CompileAll, Lexer, LineLexer, FindLongestMatch
from core.lexer import CompileAll, Lexer, LineLexer
from core import test_lib
from core.test_lib import TokensEqual
from osh import parse_lib
from osh import ast_ as ast
from osh.lex import LEXER_DEF
from osh import ast_ as ast
lex_mode_e = ast.lex_mode_e
@@ -175,49 +174,49 @@ def assertTokensEqual(self, left, right):
def testReadOuter(self):
# Lines always end with '\n'
l = LineLexer(LEXER_DEF, '')
l = LineLexer(parse_lib._MakeMatcher(), '')
try:
l.Read(lex_mode_e.OUTER)
except AssertionError as e:
print(e)
else:
raise AssertionError('Expected error')
l = LineLexer(LEXER_DEF, '\n')
l = LineLexer(parse_lib._MakeMatcher(), '\n')
self.assertTokensEqual(
ast.token(Id.Op_Newline, '\n'), l.Read(lex_mode_e.OUTER))
def testRead_VS_ARG_UNQ(self):
l = LineLexer(LEXER_DEF, "'hi'")
l = LineLexer(parse_lib._MakeMatcher(), "'hi'")
t = l.Read(lex_mode_e.VS_ARG_UNQ)
self.assertEqual(Id.Left_SingleQuote, t.id)
def testLookAhead(self):
# Lines always end with '\n'
l = LineLexer(LEXER_DEF, '')
l = LineLexer(parse_lib._MakeMatcher(), '')
self.assertTokensEqual(
ast.token(Id.Unknown_Tok, ''), l.LookAhead(lex_mode_e.OUTER))
l = LineLexer(LEXER_DEF, 'foo')
l = LineLexer(parse_lib._MakeMatcher(), 'foo')
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'foo'), l.Read(lex_mode_e.OUTER))
self.assertTokensEqual(
ast.token(Id.Unknown_Tok, ''), l.LookAhead(lex_mode_e.OUTER))
l = LineLexer(LEXER_DEF, 'foo bar')
l = LineLexer(parse_lib._MakeMatcher(), 'foo bar')
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'foo'), l.Read(lex_mode_e.OUTER))
self.assertEqual(
ast.token(Id.Lit_Chars, 'bar'), l.LookAhead(lex_mode_e.OUTER))
# No lookahead; using the cursor!
l = LineLexer(LEXER_DEF, 'func(')
l = LineLexer(parse_lib._MakeMatcher(), 'func(')
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'func'), l.Read(lex_mode_e.OUTER))
self.assertTokensEqual(
ast.token(Id.Op_LParen, '('), l.LookAhead(lex_mode_e.OUTER))
l = LineLexer(LEXER_DEF, 'func (')
l = LineLexer(parse_lib._MakeMatcher(), 'func (')
self.assertTokensEqual(
ast.token(Id.Lit_Chars, 'func'), l.Read(lex_mode_e.OUTER))
self.assertTokensEqual(
@@ -228,20 +227,6 @@ def testLookAhead(self):
DOUBLE_QUOTED_RE = CompileAll(LEXER_DEF[lex_mode_e.DQ])
class FunctionTest(unittest.TestCase):
def testFindLongestMatch(self):
e, tok_type, tok_val = FindLongestMatch(OUTER_RE, ' foo', 2)
self.assertEqual(e, 5)
self.assertEqual(tok_type, Id.Lit_Chars)
self.assertEqual(tok_val, 'foo')
e, tok_type, tok_val = FindLongestMatch(OUTER_RE, ' "foo"', 1)
self.assertEqual(e, 2)
self.assertEqual(tok_type, Id.Left_DoubleQuote)
self.assertEqual(tok_val, '"')
class RegexTest(unittest.TestCase):
def testOuter(self):
View
@@ -6,15 +6,60 @@
from core import lexer
from core import reader
from core.id_kind import Id
from osh import lex
from osh import word_parse
from osh import cmd_parse
# bin/osh should work without compiling fastlex? But we want all the unit
# tests to run with a known version of it.
try:
import fastlex
except ImportError:
fastlex = None
class MatchToken_Slow(object):
  """An abstract matcher that doesn't depend on OSH.

  Pure-Python fallback for fastlex: tries every (regex, token type) pair for
  the given lexer mode and picks the longest left-anchored match.
  """

  def __init__(self, lexer_def):
    # Compile each mode's (pattern, token type) list up front so matching
    # only pays the regex-engine cost, not compilation.
    self.lexer_def = {}
    for state, pat_list in lexer_def.items():
      self.lexer_def[state] = lexer.CompileAll(pat_list)

  def __call__(self, lex_mode, line, start_pos):
    """Returns (id, end_pos).

    Args:
      lex_mode: key into the compiled lexer_def (a lexer mode).
      line: the string to match against.
      start_pos: index in line where matching starts.

    Raises:
      AssertionError: if no pattern in this mode matches at start_pos.
    """
    re_list = self.lexer_def[lex_mode]
    matches = []
    for regex, tok_type in re_list:
      m = regex.match(line, start_pos)  # left-anchored
      if m:
        # Only the end position and token type are needed; the token text is
        # sliced out by the caller (LineLexer), so don't build it here.
        matches.append((m.end(0), tok_type))
    if not matches:
      raise AssertionError('no match at position %d: %r' % (start_pos, line))
    # NOTE: max() returns the FIRST maximal element, which we rely on to
    # break ties between patterns that match the same length.
    end_pos, tok_type = max(matches, key=lambda m: m[0])
    return tok_type, end_pos
def MatchToken_Fast(lex_mode, line, start_pos):
  """Returns (id, end_pos).

  Thin wrapper over the native fastlex extension: it takes the mode's
  integer enum_id and hands back a raw integer token type, which we wrap
  in Id for callers.
  """
  raw_type, end_pos = fastlex.MatchToken(lex_mode.enum_id, line, start_pos)
  tok_type = Id(raw_type)
  return tok_type, end_pos
def _MakeMatcher():
  """Construct the match_func that LineLexer uses.

  For now this always builds the slow pure-Python matcher.  Once the
  fastlex extension is trusted, this should become:

    if fastlex:
      return MatchToken_Fast
    else:
      return MatchToken_Slow(lex.LEXER_DEF)
  """
  return MatchToken_Slow(lex.LEXER_DEF)
def InitLexer(s, arena=None):
"""For tests only."""
line_lexer = lexer.LineLexer(lex.LEXER_DEF, '', arena=arena)
match_func = _MakeMatcher()
line_lexer = lexer.LineLexer(match_func, '', arena=arena)
line_reader = reader.StringLineReader(s, arena=arena)
lx = lexer.Lexer(line_lexer, line_reader)
return line_reader, lx
@@ -40,7 +85,7 @@ def InitLexer(s, arena=None):
def MakeParser(line_reader, arena):
"""Top level parser."""
# AtEnd() is true
line_lexer = lexer.LineLexer(lex.LEXER_DEF, '', arena=arena)
line_lexer = lexer.LineLexer(_MakeMatcher(), '', arena=arena)
lx = lexer.Lexer(line_lexer, line_reader)
w_parser = word_parse.WordParser(lx, line_reader)
c_parser = cmd_parse.CommandParser(w_parser, lx, line_reader, arena=arena)
@@ -59,7 +104,7 @@ def MakeParserForCompletion(code_str, arena=None):
# NOTE: We don't need to use a arena here? Or we need a "scratch arena" that
# doesn't interfere with the rest of the program.
line_reader = reader.StringLineReader(code_str)
line_lexer = lexer.LineLexer(lex.LEXER_DEF, '', arena=arena) # AtEnd() is true
line_lexer = lexer.LineLexer(_MakeMatcher(), '', arena=arena) # AtEnd() is true
lx = lexer.Lexer(line_lexer, line_reader)
w_parser = word_parse.WordParser(lx, line_reader)
c_parser = cmd_parse.CommandParser(w_parser, lx, line_reader, arena=arena)
@@ -68,7 +113,7 @@ def MakeParserForCompletion(code_str, arena=None):
def MakeWordParserForHereDoc(lines, arena):
line_reader = reader.VirtualLineReader(lines, arena)
line_lexer = lexer.LineLexer(lex.LEXER_DEF, '', arena=arena)
line_lexer = lexer.LineLexer(_MakeMatcher(), '', arena=arena)
lx = lexer.Lexer(line_lexer, line_reader)
return word_parse.WordParser(lx, line_reader)

0 comments on commit 529aaf2

Please sign in to comment.