commit 529aaf2

@@ -36,52 +36,15 @@ def CompileAll(pat_list):
   return result


-def FindLongestMatch(re_list, s, pos):
-  """Finds the longest match.
-
-  NOTE: max() returns the FIRST maximal element (documented CPython
-  behavior), which we rely on: ties between equal-length matches go to
-  the earliest pattern in re_list.
-  """
-  matches = []
-  for regex, tok_type in re_list:
-    m = regex.match(s, pos)  # left-anchored
-    if m:
-      matches.append((m.end(0), tok_type, m.group(0)))
-  if not matches:
-    raise AssertionError('no match at position %d: %r' % (pos, s))
-  end_index, tok_type, tok_val = max(matches, key=lambda m: m[0])
-  return end_index, tok_type, tok_val
-
-
-# TODO: LineLexer needs the matcher rather than lexer_def.
-class MatchTokenSlow(object):
-  """An abstract matcher that doesn't depend on OSH."""
-
-  def __init__(self, lexer_def):
-    self.lexer_def = lexer_def
-
-  def __call__(self, lex_mode, line, start_index):
-    """Returns (id, end_index)."""
-    end_index, tok_type, tok_val = FindLongestMatch(
-        self.lexer_def[lex_mode], line, start_index)
-    return tok_type, end_index
-
-
-def MatchTokenFast(lex_mode, line, start_index):
-  """Returns (id, end_index)."""
-  tok_type, end_index = lex.MatchToken(lex_mode.enum_id, line, start_index)
-  return Id(tok_type), end_index
-
-
 class LineLexer(object):
-  def __init__(self, lexer_def, line, arena=None):
-    # Compile all regexes
-    self.lexer_def = {}
+  def __init__(self, match_func, line, arena=None):
+    self.match_func = match_func
     self.arena = arena

     self.arena_skip = False  # For MaybeUnreadOne
     self.last_span_id = -1  # For MaybeUnreadOne

-    for state, pat_list in lexer_def.items():
-      self.lexer_def[state] = CompileAll(pat_list)
-
     self.Reset(line, -1)  # Invalid arena index to start

   def Reset(self, line, line_id):
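
Aside: the tie-breaking rule the removed docstring relies on is real; CPython's max() is documented to return the first maximal element it encounters, so earlier patterns in re_list win ties. A minimal standalone sketch of the maximal-munch behavior, with made-up patterns and string token names standing in for Id.* values:

import re

RE_LIST = [
  (re.compile(r'[a-z]+'), 'Lit_Chars'),    # listed first, so it wins ties
  (re.compile(r'[a-z0-9]+'), 'Lit_Other'),
]

def Longest(re_list, s, pos):
  matches = []
  for regex, tok_type in re_list:
    m = regex.match(s, pos)  # left-anchored, like FindLongestMatch above
    if m:
      matches.append((m.end(0), tok_type, m.group(0)))
  return max(matches, key=lambda m: m[0])  # first max among equal lengths

print(Longest(RE_LIST, 'echo hi', 0))  # (4, 'Lit_Chars', 'echo'): tie; first pattern wins
print(Longest(RE_LIST, 'ab12 x', 0))   # (4, 'Lit_Other', 'ab12'): strictly longer match wins
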
@@ -128,14 +91,13 @@ def LookAhead(self, lex_mode):
         t = ast.token(Id.Unknown_Tok, '', -1)  # no span ID
         return t

-      re_list = self.lexer_def[lex_mode]
-      end_index, tok_type, tok_val = FindLongestMatch(
-          re_list, self.line, pos)
+      tok_type, end_pos = self.match_func(lex_mode, self.line, pos)
+      tok_val = self.line[pos:end_pos]
       # NOTE: Instead of hard-coding this token, we could pass it in.  This
       # one only appears in OUTER state!  LookAhead(lex_mode, past_token_type)
       if tok_type != Id.WS_Space:
         break
-      pos = end_index
+      pos = end_pos

     return ast.token(tok_type, tok_val)  # no location
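
This hunk shows the new contract in full: LineLexer no longer touches regexes at all. It calls match_func(lex_mode, line, start_index), gets back (token_id, end_index), and slices tok_val out of the line itself. Anything with that shape will do, which is what makes the matcher swappable. A sketch of a trivial stand-in matcher (illustrative only, not part of this commit):

class FakeMatcher(object):
  """Splits a line into runs of spaces and runs of non-spaces."""

  def __call__(self, lex_mode, line, start_index):
    # Same shape as MatchTokenSlow / MatchTokenFast: returns (id, end_index).
    # Assumes start_index < len(line), which LineLexer's callers guarantee
    # by checking AtEnd() / pos == len(line) first.
    is_space = (line[start_index] == ' ')
    i = start_index
    while i < len(line) and (line[i] == ' ') == is_space:
      i += 1
    tok_id = 'WS_Space' if is_space else 'Lit_Chars'  # strings standing in for Id.*
    return tok_id, i
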
@@ -146,10 +108,8 @@ def Read(self, lex_mode):
     if self.AtEnd():
       raise AssertionError('EOF')

-    re_list = self.lexer_def[lex_mode]
-
-    end_index, tok_type, tok_val = FindLongestMatch(
-        re_list, self.line, self.line_pos)
+    tok_type, end_pos = self.match_func(lex_mode, self.line, self.line_pos)
+    tok_val = self.line[self.line_pos:end_pos]

     # NOTE: tok_val is redundant, but even in osh.asdl we have some separation
     # between data needed for formatting and data needed for execution.  Could
@@ -178,7 +138,7 @@ def Read(self, lex_mode):
     t = ast.token(tok_type, tok_val, span_id)

-    self.line_pos = end_index
+    self.line_pos = end_pos
     return t
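
Putting the pieces together, construction changes from LineLexer(lexer_def, line) to passing a matcher. A sketch under the assumption that a LEXER_DEF table and an arena exist as elsewhere in the module (those names are stand-ins):

# Pure-Python path: compile the pattern tables once, outside the lexer,
# mirroring the loop deleted from __init__ above.
compiled = {}
for state, pat_list in LEXER_DEF.items():
  compiled[state] = CompileAll(pat_list)
match_func = MatchTokenSlow(compiled)
line_lexer = LineLexer(match_func, 'echo hi', arena=arena)

# Native path: MatchTokenFast already has the match_func shape, so the
# module-level function itself can be passed.
line_lexer = LineLexer(MatchTokenFast, 'echo hi', arena=arena)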