
Move SimpleLexer to osh/match.py.

Some other renamings.

Preparing for lowering the 'echo -e' lexer into re2c.
Andy Chu committed May 29, 2018
1 parent 56ea936 commit b3e6cb7f76874286601809fa37998ef1f6698add
Showing with 58 additions and 50 deletions.
  1. +1 −2 core/builtin.py
  2. +0 −39 core/lexer.py
  3. +3 −3 osh/lex_test.py
  4. +54 −6 osh/match.py
core/builtin.py

@@ -30,7 +30,6 @@
 import sys
 from core import args
-from core import lexer
 from core import util
 from core import state
 from core import word_compile
@@ -161,7 +160,7 @@ def Resolve(argv0):
 # Implementation of builtins.
 #
-ECHO_LEXER = lexer.SimpleLexer(lex.ECHO_E_DEF)
+ECHO_LEXER = match.SimpleLexer(lex.ECHO_E_DEF)
 ECHO_SPEC = _Register('echo')
 ECHO_SPEC.ShortFlag('-e')  # no backslash escapes
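
For context, a minimal sketch of how the echo builtin can consume ECHO_LEXER once backslash escapes are enabled. The helper name _EvalEchoArg is invented for this example, and word_compile.EvalCStringToken is assumed to be the escape evaluator that pairs with ECHO_E_DEF; the real wiring lives in core/builtin.py.

    def _EvalEchoArg(arg, backslash_escapes):
      # Hypothetical helper, not the actual builtin code.
      if not backslash_escapes:
        return arg  # without -e, the argument passes through untouched
      parts = []
      for tok_type, tok_val in ECHO_LEXER.Tokens(arg):
        # EvalCStringToken (assumed) turns tokens like \t and \377 into bytes.
        parts.append(word_compile.EvalCStringToken(tok_type, tok_val))
      return ''.join(parts)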
core/lexer.py

@@ -9,9 +9,6 @@
 lexer.py - Library for lexing.
 """
-# CompileAll() used for SimpleLexer
-import re
 from asdl import const
 from core import util
 from osh.meta import Id
@@ -30,15 +27,6 @@ def R(pat, tok_type):
   return (True, pat, tok_type)
-def CompileAll(pat_list):
-  result = []
-  for is_regex, pat, token_id in pat_list:
-    if not is_regex:
-      pat = re.escape(pat)  # turn $ into \$
-    result.append((re.compile(pat), token_id))
-  return result
 class LineLexer(object):
   def __init__(self, match_func, line, arena):
     # Compile all regexes
@@ -234,30 +222,3 @@ def Read(self, lex_mode):
     #log('Read() Returning %s', t)
     return t
-class SimpleLexer(object):
-  """Lexer for echo -e, which interprets C-escaped strings.
-  Based on osh/parse_lib.py MatchToken_Slow.
-  """
-  def __init__(self, pat_list):
-    self.pat_list = CompileAll(pat_list)
-  def Tokens(self, line):
-    """Yields tokens."""
-    pos = 0
-    n = len(line)
-    while pos < n:
-      matches = []
-      for regex, tok_type in self.pat_list:
-        m = regex.match(line, pos)  # left-anchored
-        if m:
-          matches.append((m.end(0), tok_type, m.group(0)))
-      if not matches:
-        raise AssertionError(
-            'no match at position %d: %r (%r)' % (pos, line, line[pos]))
-      # NOTE: Need longest-match semantics to find \377 vs \.
-      end_pos, tok_type, tok_val = max(matches, key=lambda m: m[0])
-      yield tok_type, line[pos:end_pos]
-      pos = end_pos
osh/lex_test.py

@@ -6,7 +6,7 @@
 import unittest
-from core.lexer import CompileAll, LineLexer
+from core.lexer import LineLexer
 from core import test_lib
 from osh import match
@@ -227,8 +227,8 @@ def testLookAhead(self):
         ast.token(Id.Op_LParen, '('), l.LookAhead(lex_mode_e.OUTER))
-OUTER_RE = CompileAll(LEXER_DEF[lex_mode_e.OUTER])
-DOUBLE_QUOTED_RE = CompileAll(LEXER_DEF[lex_mode_e.DQ])
+OUTER_RE = match.CompileAll(LEXER_DEF[lex_mode_e.OUTER])
+DOUBLE_QUOTED_RE = match.CompileAll(LEXER_DEF[lex_mode_e.DQ])
 class RegexTest(unittest.TestCase):
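
The CompileAll() being renamed here takes pattern lists of (is_regex, pat, token_id) tuples, where the R() helper shown earlier produces the regex form, and it escapes literal entries before compiling. A standalone sketch of that contract, with made-up string token ids standing in for the real Id instances:

    import re

    def CompileAll(pat_list):  # same logic as the function moved in this commit
      result = []
      for is_regex, pat, token_id in pat_list:
        if not is_regex:
          pat = re.escape(pat)  # turn $ into \$
        result.append((re.compile(pat), token_id))
      return result

    PAIRS = CompileAll([
        (False, '$', 'Lit_Dollar'),         # literal: escaped to \$
        (True, '[a-zA-Z_]+', 'Lit_Chars'),  # already a regex
    ])
    assert PAIRS[0][0].match('$x')              # the literal dollar matches
    assert PAIRS[1][0].match('foo').end() == 3  # the regex matches 3 chars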
osh/match.py

@@ -15,12 +15,12 @@
   fastlex = None
-class _MatchToken_Slow(object):
+class _MatchOshToken_Slow(object):
   """An abstract matcher that doesn't depend on OSH."""
   def __init__(self, lexer_def):
     self.lexer_def = {}
-    for state, pat_list in lexer_def.items():
-      self.lexer_def[state] = lexer.CompileAll(pat_list)
+    for lex_mode, pat_list in lexer_def.items():
+      self.lexer_def[lex_mode] = lexer.CompileAll(pat_list)
   def __call__(self, lex_mode, line, start_pos):
     """Returns (id, end_pos)."""
@@ -40,19 +40,67 @@ def __call__(self, lex_mode, line, start_pos):
     return tok_type, end_pos
-def _MatchToken_Fast(lex_mode, line, start_pos):
+def _MatchOshToken_Fast(lex_mode, line, start_pos):
   """Returns (id, end_pos)."""
   tok_type, end_pos = fastlex.MatchToken(lex_mode.enum_id, line, start_pos)
   # IMPORTANT: We're reusing Id instances here.  Ids are very common, so this
   # saves memory.
   return IdInstance(tok_type), end_pos
+# CompileAll() used for SimpleLexer
+import re
+def CompileAll(pat_list):
+  result = []
+  for is_regex, pat, token_id in pat_list:
+    if not is_regex:
+      pat = re.escape(pat)  # turn $ into \$
+    result.append((re.compile(pat), token_id))
+  return result
+# TODO:
+#
+# MatchToken -> MatchOshToken()
+# Add MatchEchoToken()
+#
+# MATCHER -> OSH_MATCHER
+# add ECHO_MATCHER
+# TODO: Make this the slow path!
+class SimpleLexer(object):
+  """Lexer for echo -e, which interprets C-escaped strings.
+  Based on osh/parse_lib.py MatchToken_Slow.
+  """
+  def __init__(self, pat_list):
+    self.pat_list = CompileAll(pat_list)
+  def Tokens(self, line):
+    """Yields tokens."""
+    pos = 0
+    n = len(line)
+    while pos < n:
+      matches = []
+      for regex, tok_type in self.pat_list:
+        m = regex.match(line, pos)  # left-anchored
+        if m:
+          matches.append((m.end(0), tok_type, m.group(0)))
+      if not matches:
+        raise AssertionError(
+            'no match at position %d: %r (%r)' % (pos, line, line[pos]))
+      # NOTE: Need longest-match semantics to find \377 vs \.
+      end_pos, tok_type, tok_val = max(matches, key=lambda m: m[0])
+      yield tok_type, line[pos:end_pos]
+      pos = end_pos
 if fastlex:
-  MATCHER = _MatchToken_Fast
+  MATCHER = _MatchOshToken_Fast
   IsValidVarName = fastlex.IsValidVarName
 else:
-  MATCHER = _MatchToken_Slow(lex.LEXER_DEF)
+  MATCHER = _MatchOshToken_Slow(lex.LEXER_DEF)
 # Used by osh/cmd_parse.py to validate for loop name.  Note it must be
 # anchored on the right.
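
To see why SimpleLexer needs the longest-match rule noted above (\377 vs. a bare \), here is a hedged usage sketch. The pattern list below is invented for illustration; the real definition is lex.ECHO_E_DEF with Id.Char_* token ids, and the re2c lowering the commit message mentions would presumably move this matching into fastlex, next to fastlex.MatchToken().

    from osh.match import SimpleLexer

    # Invented (is_regex, pat, token_id) entries; not the real ECHO_E_DEF.
    ECHO_E_SKETCH = [
        (True, r'\\0[0-7]{1,3}', 'Char_Octal'),   # e.g. \0101
        (True, r'\\[abenrt\\]', 'Char_OneChar'),  # e.g. \t, \n
        (True, r'[^\\]+', 'Char_Literals'),       # runs of ordinary chars
        (True, r'\\', 'Char_BadBackslash'),       # lone backslash
    ]

    for tok_type, tok_val in SimpleLexer(ECHO_E_SKETCH).Tokens(r'ab\t\0101'):
      print(tok_type, repr(tok_val))

    # At \0101, both Char_BadBackslash (1 char) and Char_Octal (5 chars)
    # match; max() by end position picks the longer octal token, just as
    # it picks \377 over a bare backslash in the real ECHO_E_DEF.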
