[pgen2] Start hooking up the Oil lexer to pgen2.
Also rename and prettify some pgen2 source code.  Ran opy/regtest.sh
{compile-all,verify-golden}.
Andy Chu committed May 22, 2019
1 parent 1fb13e9 commit 37098df
Showing 5 changed files with 157 additions and 77 deletions.
38 changes: 17 additions & 21 deletions opy/pgen2/parse.py
@@ -7,23 +7,22 @@
 See Parser/parser.c in the Python distribution for additional info on
 how this parsing engine works.
-
 """

 # Local imports
 from . import token

 class ParseError(Exception):
     """Exception to signal the parser is stuck."""

-    def __init__(self, msg, type, value, context):
+    def __init__(self, msg, typ, value, context):
         Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
-                           (msg, type, value, context))
+                           (msg, typ, value, context))
         self.msg = msg
-        self.type = type
+        self.type = typ
         self.value = value
         self.context = context


 class Parser(object):
     """Parser engine.
@@ -51,7 +50,6 @@ class Parser(object):
     the ParseError exception. There is no error recovery; the parser
     cannot be used after a syntax error was reported (but it can be
     reinitialized by calling setup()).
-
     """

     def __init__(self, grammar, convert=None):
@@ -81,7 +79,6 @@ def __init__(self, grammar, convert=None):
         An abstract syntax tree node may be anything; this is entirely
         up to the converter function.
-
         """
         self.grammar = grammar
         self.convert = convert or (lambda grammar, node: node)
@@ -97,7 +94,6 @@ def setup(self, start=None):
         You can use a Parser instance to parse any number of programs;
         each time you call setup() the parser is reset to an initial
         state determined by the (implicit or explicit) start symbol.
-
         """
         if start is None:
             start = self.grammar.start
@@ -110,10 +106,10 @@
         self.rootnode = None
         self.used_names = set()  # Aliased to self.rootnode.used_names in pop()

-    def addtoken(self, type, value, context):
+    def addtoken(self, typ, value, context):
         """Add a token; return True iff this is the end of the program."""
         # Map from token to label
-        ilabel = self.classify(type, value, context)
+        ilabel = self.classify(typ, value, context)
         # Loop until the token is shifted; may raise exceptions
         while True:
             dfa, state, node = self.stack[-1]
@@ -126,7 +122,7 @@ def addtoken(self, type, value, context):
                     # Look it up in the list of labels
                     assert t < 256
                     # Shift a token; we're done with it
-                    self.shift(type, value, newstate, context)
+                    self.shift(typ, value, newstate, context)
                     # Pop while we are in an accept-only state
                     state = newstate
                     while states[state] == [(0, state)]:
@@ -153,38 +149,38 @@
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input",
-                                         type, value, context)
+                                         typ, value, context)
                else:
                    # No success finding a transition
-                   raise ParseError("bad input", type, value, context)
+                   raise ParseError("bad input", typ, value, context)

-    def classify(self, type, value, context):
+    def classify(self, typ, value, context):
         """Turn a token into a label. (Internal)"""
-        if type == token.NAME:
+        if typ == token.NAME:
             # Keep a listing of all used names
             self.used_names.add(value)
             # Check for reserved words
             ilabel = self.grammar.keywords.get(value)
             if ilabel is not None:
                 return ilabel
-        ilabel = self.grammar.tokens.get(type)
+        ilabel = self.grammar.tokens.get(typ)
         if ilabel is None:
-            raise ParseError("bad token", type, value, context)
+            raise ParseError("bad token", typ, value, context)
         return ilabel

-    def shift(self, type, value, newstate, context):
+    def shift(self, typ, value, newstate, context):
         """Shift a token. (Internal)"""
         dfa, state, node = self.stack[-1]
-        newnode = (type, value, context, None)
+        newnode = (typ, value, context, None)
         newnode = self.convert(self.grammar, newnode)
         if newnode is not None:
             node[-1].append(newnode)
         self.stack[-1] = (dfa, newstate, node)

-    def push(self, type, newdfa, newstate, context):
+    def push(self, typ, newdfa, newstate, context):
         """Push a nonterminal. (Internal)"""
         dfa, state, node = self.stack[-1]
-        newnode = (type, None, context, [])
+        newnode = (typ, None, context, [])
         self.stack[-1] = (dfa, newstate, node)
         self.stack.append((newdfa, 0, newnode))

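For orientation, here's how a caller drives this push-based engine end to end. This is a minimal sketch assuming a loaded grammar.Grammar instance and an iterable of (typ, value, context) tuples; the import path and the function name parse_tokens are illustrative, not part of this commit (the project drives the parser through pgen2's driver module).

    # Minimal driver sketch for the push-based Parser above.
    # Assumes `gram` is a grammar.Grammar loaded from generated tables
    # and `tokens` yields (typ, value, context) tuples.
    from opy.pgen2 import parse

    def parse_tokens(gram, tokens):
        p = parse.Parser(gram)  # convert defaults to the identity function
        p.setup()               # reset to the grammar's start symbol
        for typ, value, context in tokens:
            if p.addtoken(typ, value, context):
                # The start symbol was fully reduced: parsing is done.
                return p.rootnode
        raise parse.ParseError('incomplete input', None, None, None)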
70 changes: 33 additions & 37 deletions opy/pgen2/pgen.py
@@ -6,9 +6,8 @@
 #import grammar, token, tokenize
 # NOTE: Need these special versions of token/tokenize for BACKQUOTE and such.
 from . import grammar, token, tokenize
+from core.util import log

-class PgenGrammar(grammar.Grammar):
-    pass

 class ParserGenerator(object):

@@ -28,83 +27,83 @@ def __init__(self, filename, stream=None):
         self.addfirstsets()

     def make_grammar(self):
-        c = PgenGrammar()
+        gr = grammar.Grammar()
         names = list(self.dfas.keys())
         names.sort()
         names.remove(self.startsymbol)
         names.insert(0, self.startsymbol)
         for name in names:
-            i = 256 + len(c.symbol2number)
-            c.symbol2number[name] = i
-            c.number2symbol[i] = name
+            i = 256 + len(gr.symbol2number)
+            gr.symbol2number[name] = i
+            gr.number2symbol[i] = name
         for name in names:
             dfa = self.dfas[name]
             states = []
             for state in dfa:
                 arcs = []
-                for label, next in sorted(state.arcs.items()):
-                    arcs.append((self.make_label(c, label), dfa.index(next)))
+                for label, next_ in sorted(state.arcs.items()):
+                    arcs.append((self.make_label(gr, label), dfa.index(next_)))
                 if state.isfinal:
                     arcs.append((0, dfa.index(state)))
                 states.append(arcs)
-            c.states.append(states)
-            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
-        c.start = c.symbol2number[self.startsymbol]
-        return c
+            gr.states.append(states)
+            gr.dfas[gr.symbol2number[name]] = (states, self.make_first(gr, name))
+        gr.start = gr.symbol2number[self.startsymbol]
+        return gr

-    def make_first(self, c, name):
+    def make_first(self, gr, name):
         rawfirst = self.first[name]
         first = {}
         for label in sorted(rawfirst):
-            ilabel = self.make_label(c, label)
+            ilabel = self.make_label(gr, label)
             ##assert ilabel not in first # XXX failed on <> ... !=
             first[ilabel] = 1
         return first

-    def make_label(self, c, label):
+    def make_label(self, gr, label):
         # XXX Maybe this should be a method on a subclass of converter?
-        ilabel = len(c.labels)
+        ilabel = len(gr.labels)
         if label[0].isalpha():
             # Either a symbol name or a named token
-            if label in c.symbol2number:
+            if label in gr.symbol2number:
                 # A symbol name (a non-terminal)
-                if label in c.symbol2label:
-                    return c.symbol2label[label]
+                if label in gr.symbol2label:
+                    return gr.symbol2label[label]
                 else:
-                    c.labels.append((c.symbol2number[label], None))
-                    c.symbol2label[label] = ilabel
+                    gr.labels.append((gr.symbol2number[label], None))
+                    gr.symbol2label[label] = ilabel
                     return ilabel
             else:
                 # A named token (NAME, NUMBER, STRING)
                 itoken = getattr(token, label, None)
                 assert isinstance(itoken, int), label
                 assert itoken in token.tok_name, label
-                if itoken in c.tokens:
-                    return c.tokens[itoken]
+                if itoken in gr.tokens:
+                    return gr.tokens[itoken]
                 else:
-                    c.labels.append((itoken, None))
-                    c.tokens[itoken] = ilabel
+                    gr.labels.append((itoken, None))
+                    gr.tokens[itoken] = ilabel
                     return ilabel
         else:
             # Either a keyword or an operator
             assert label[0] in ('"', "'"), label
             value = eval(label)
             if value[0].isalpha():
                 # A keyword
-                if value in c.keywords:
-                    return c.keywords[value]
+                if value in gr.keywords:
+                    return gr.keywords[value]
                 else:
-                    c.labels.append((token.NAME, value))
-                    c.keywords[value] = ilabel
+                    gr.labels.append((token.NAME, value))
+                    gr.keywords[value] = ilabel
                     return ilabel
             else:
                 # An operator (any non-numeric token)
                 itoken = grammar.opmap[value] # Fails if unknown token
-                if itoken in c.tokens:
-                    return c.tokens[itoken]
+                if itoken in gr.tokens:
+                    return gr.tokens[itoken]
                 else:
-                    c.labels.append((itoken, None))
-                    c.tokens[itoken] = ilabel
+                    gr.labels.append((itoken, None))
+                    gr.tokens[itoken] = ilabel
                     return ilabel

     def addfirstsets(self):
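To make the label bookkeeping above concrete, here is a hedged sketch of the kinds of entries make_label() appends to gr.labels for a toy grammar. The indices and example labels are illustrative, not output from this commit; the one fixed fact is that grammar.Grammar() reserves entry 0 as EMPTY.

    # Sketch of gr.labels after generating a toy expression grammar.
    # Each entry is (token or symbol number, keyword string or None).
    gr_labels = [
        (0, 'EMPTY'),   # reserved entry 0, created by grammar.Grammar()
        (257, None),    # 'expr' -> a non-terminal, via gr.symbol2number
        (1, None),      # 'NAME' -> a named token (token.NAME == 1)
        (1, 'if'),      # "'if'" -> a keyword: NAME restricted to one value
        (14, None),     # "'+'"  -> an operator, via grammar.opmap (token.PLUS == 14)
    ]

classify() in parse.py then maps each incoming token to one of these ilabel indices, checking keywords first and falling back to the generic token type.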
@@ -330,10 +329,7 @@ def gettoken(self):

     def raise_error(self, msg, *args):
         if args:
-            try:
-                msg = msg % args
-            except:
-                msg = " ".join([msg] + list(map(str, args)))
+            msg = msg % args
         raise SyntaxError(msg, (self.filename, self.end[0],
                                 self.end[1], self.line))

11 changes: 6 additions & 5 deletions pgen2/pgen2-test.sh
@@ -54,13 +54,15 @@
 # - lex_mode_e.Block -- newlines are terminators
 # - lex_mode_e.CharClass -- regex char classes have different rules
 #   (outer regexes use Expr mode, I believe)
+# - lex_mode_e.TypeExpr -- because I hit the >> problem!
+#   >> is not an operator in type expressions
 # - lex_mode_e.Str  # simple double-quoted string literal?
 #     # I don't want all the mess
 #     # or you can post-process the LST and eliminate
 #     # undesirable shell constructs
 #
 # Extensions to pgen:
-# - take tokens from a different lexer
+# - take tokens from a different lexer -- see NOTES-pgen2.txt for syntax ideas
 # - callbacks to invoke the parser
 #   - hm actually the "driver" can do this because it sees all the tokens?
 #     - it's pushing rather than pulling.
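That first extension -- taking tokens from a different lexer -- is what the commit title refers to: the pgen2 Parser is push-based (addtoken), so nothing ties it to Python's tokenize module. A hedged sketch of what such an adapter could look like; the mapping table, the tok.id/tok.val/tok.line/tok.col attributes, and the Oil token names are hypothetical, chosen only to show the shape, not APIs from this commit:

    # Hypothetical adapter feeding Oil lexer output into a pgen2 Parser.
    # OIL_TO_PGEN2 and the token attributes below are illustrative.
    from opy.pgen2 import parse, token

    OIL_TO_PGEN2 = {
        'Lit_Chars': token.NAME,   # example mapping, not from this commit
        'Arith_Plus': token.PLUS,
    }

    def push_oil_tokens(p, oil_tokens):
        # `p` is a parse.Parser that has already had setup() called.
        for tok in oil_tokens:
            typ = OIL_TO_PGEN2[tok.id]
            if p.addtoken(typ, tok.val, (tok.line, tok.col)):
                return p.rootnode  # the grammar's start symbol was accepted
        raise parse.ParseError('incomplete input', None, None, None)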
@@ -171,9 +173,9 @@ calc-test() {
     'a + 2'
     '1 + 2*3/4'  # operator precedence and left assoc
     '"abc" + "def"'
-    #'2 ** 3 ** 4'  # right assoc
-    #'f(1, 2, 3)'
-    #'f(a[i], 2, 3)'
+    '2 ** 3 ** 4'  # right assoc
+    'f(1, 2, 3)'
+    'f(a[i], 2, 3)'

     # bad token
     'a * 3&4'
@@ -209,7 +211,6 @@ ll1-test() {
   done
 }

-
 all() {
   banner 'exprs'
   parse-exprs
