[pgen2] Start hooking up the Oil lexer to pgen2.
Also rename and prettify some pgen2 source code.  Ran opy/regtest.sh
{compile-all,verify-golden}.
Andy Chu committed May 22, 2019
1 parent 1fb13e9 commit 37098df
Showing 5 changed files with 157 additions and 77 deletions.
38 changes: 17 additions & 21 deletions opy/pgen2/parse.py
@@ -7,23 +7,22 @@
 See Parser/parser.c in the Python distribution for additional info on
 how this parsing engine works.
-
 """

 # Local imports
 from . import token

 class ParseError(Exception):
     """Exception to signal the parser is stuck."""

-    def __init__(self, msg, type, value, context):
+    def __init__(self, msg, typ, value, context):
         Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
-                           (msg, type, value, context))
+                           (msg, typ, value, context))
         self.msg = msg
-        self.type = type
+        self.type = typ
         self.value = value
         self.context = context


 class Parser(object):
     """Parser engine.
@@ -51,7 +50,6 @@ class Parser(object):
     the ParseError exception. There is no error recovery; the parser
     cannot be used after a syntax error was reported (but it can be
     reinitialized by calling setup()).
-
     """

     def __init__(self, grammar, convert=None):
@@ -81,7 +79,6 @@ def __init__(self, grammar, convert=None):
         An abstract syntax tree node may be anything; this is entirely
         up to the converter function.
-
         """
         self.grammar = grammar
         self.convert = convert or (lambda grammar, node: node)
@@ -97,7 +94,6 @@ def setup(self, start=None):
         You can use a Parser instance to parse any number of programs;
         each time you call setup() the parser is reset to an initial
         state determined by the (implicit or explicit) start symbol.
-
         """
         if start is None:
             start = self.grammar.start
@@ -110,10 +106,10 @@
         self.rootnode = None
         self.used_names = set()  # Aliased to self.rootnode.used_names in pop()

-    def addtoken(self, type, value, context):
+    def addtoken(self, typ, value, context):
         """Add a token; return True iff this is the end of the program."""
         # Map from token to label
-        ilabel = self.classify(type, value, context)
+        ilabel = self.classify(typ, value, context)
         # Loop until the token is shifted; may raise exceptions
         while True:
             dfa, state, node = self.stack[-1]
@@ -126,7 +122,7 @@ def addtoken(self, type, value, context):
                     # Look it up in the list of labels
                     assert t < 256
                     # Shift a token; we're done with it
-                    self.shift(type, value, newstate, context)
+                    self.shift(typ, value, newstate, context)
                     # Pop while we are in an accept-only state
                     state = newstate
                     while states[state] == [(0, state)]:
@@ -153,38 +149,38 @@
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input",
-                                         type, value, context)
+                                         typ, value, context)
                else:
                    # No success finding a transition
-                   raise ParseError("bad input", type, value, context)
+                   raise ParseError("bad input", typ, value, context)

-    def classify(self, type, value, context):
+    def classify(self, typ, value, context):
         """Turn a token into a label. (Internal)"""
-        if type == token.NAME:
+        if typ == token.NAME:
             # Keep a listing of all used names
             self.used_names.add(value)
             # Check for reserved words
             ilabel = self.grammar.keywords.get(value)
             if ilabel is not None:
                 return ilabel
-        ilabel = self.grammar.tokens.get(type)
+        ilabel = self.grammar.tokens.get(typ)
         if ilabel is None:
-            raise ParseError("bad token", type, value, context)
+            raise ParseError("bad token", typ, value, context)
         return ilabel

-    def shift(self, type, value, newstate, context):
+    def shift(self, typ, value, newstate, context):
         """Shift a token. (Internal)"""
         dfa, state, node = self.stack[-1]
-        newnode = (type, value, context, None)
+        newnode = (typ, value, context, None)
         newnode = self.convert(self.grammar, newnode)
         if newnode is not None:
             node[-1].append(newnode)
         self.stack[-1] = (dfa, newstate, node)

-    def push(self, type, newdfa, newstate, context):
+    def push(self, typ, newdfa, newstate, context):
         """Push a nonterminal. (Internal)"""
         dfa, state, node = self.stack[-1]
-        newnode = (type, None, context, [])
+        newnode = (typ, None, context, [])
         self.stack[-1] = (dfa, newstate, node)
         self.stack.append((newdfa, 0, newnode))

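For orientation, here's how a caller drives this push-based engine end to end. This is a minimal sketch assuming a loaded grammar.Grammar instance and an iterable of (typ, value, context) tuples; the import path and the function name parse_tokens are illustrative, not part of this commit (the project drives the parser through pgen2's driver module).

    # Minimal driver sketch for the push-based Parser above.
    # Assumes `gram` is a grammar.Grammar loaded from generated tables
    # and `tokens` yields (typ, value, context) tuples.
    from opy.pgen2 import parse

    def parse_tokens(gram, tokens):
        p = parse.Parser(gram)  # convert defaults to the identity function
        p.setup()               # reset to the grammar's start symbol
        for typ, value, context in tokens:
            if p.addtoken(typ, value, context):
                # The start symbol was fully reduced: parsing is done.
                return p.rootnode
        raise parse.ParseError('incomplete input', None, None, None)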
70 changes: 33 additions & 37 deletions opy/pgen2/pgen.py
@@ -6,9 +6,8 @@
 #import grammar, token, tokenize
 # NOTE: Need these special versions of token/tokenize for BACKQUOTE and such.
 from . import grammar, token, tokenize
+from core.util import log

-class PgenGrammar(grammar.Grammar):
-    pass

 class ParserGenerator(object):

@@ -28,83 +27,83 @@ def __init__(self, filename, stream=None):
         self.addfirstsets()

     def make_grammar(self):
-        c = PgenGrammar()
+        gr = grammar.Grammar()
         names = list(self.dfas.keys())
         names.sort()
         names.remove(self.startsymbol)
         names.insert(0, self.startsymbol)
         for name in names:
-            i = 256 + len(c.symbol2number)
-            c.symbol2number[name] = i
-            c.number2symbol[i] = name
+            i = 256 + len(gr.symbol2number)
+            gr.symbol2number[name] = i
+            gr.number2symbol[i] = name
         for name in names:
             dfa = self.dfas[name]
             states = []
             for state in dfa:
                 arcs = []
-                for label, next in sorted(state.arcs.items()):
-                    arcs.append((self.make_label(c, label), dfa.index(next)))
+                for label, next_ in sorted(state.arcs.items()):
+                    arcs.append((self.make_label(gr, label), dfa.index(next_)))
                 if state.isfinal:
                     arcs.append((0, dfa.index(state)))
                 states.append(arcs)
-            c.states.append(states)
-            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
-        c.start = c.symbol2number[self.startsymbol]
-        return c
+            gr.states.append(states)
+            gr.dfas[gr.symbol2number[name]] = (states, self.make_first(gr, name))
+        gr.start = gr.symbol2number[self.startsymbol]
+        return gr

-    def make_first(self, c, name):
+    def make_first(self, gr, name):
         rawfirst = self.first[name]
         first = {}
         for label in sorted(rawfirst):
-            ilabel = self.make_label(c, label)
+            ilabel = self.make_label(gr, label)
             ##assert ilabel not in first # XXX failed on <> ... !=
             first[ilabel] = 1
         return first

-    def make_label(self, c, label):
+    def make_label(self, gr, label):
         # XXX Maybe this should be a method on a subclass of converter?
-        ilabel = len(c.labels)
+        ilabel = len(gr.labels)
         if label[0].isalpha():
             # Either a symbol name or a named token
-            if label in c.symbol2number:
+            if label in gr.symbol2number:
                 # A symbol name (a non-terminal)
-                if label in c.symbol2label:
-                    return c.symbol2label[label]
+                if label in gr.symbol2label:
+                    return gr.symbol2label[label]
                 else:
-                    c.labels.append((c.symbol2number[label], None))
-                    c.symbol2label[label] = ilabel
+                    gr.labels.append((gr.symbol2number[label], None))
+                    gr.symbol2label[label] = ilabel
                     return ilabel
             else:
                 # A named token (NAME, NUMBER, STRING)
                 itoken = getattr(token, label, None)
                 assert isinstance(itoken, int), label
                 assert itoken in token.tok_name, label
-                if itoken in c.tokens:
-                    return c.tokens[itoken]
+                if itoken in gr.tokens:
+                    return gr.tokens[itoken]
                 else:
-                    c.labels.append((itoken, None))
-                    c.tokens[itoken] = ilabel
+                    gr.labels.append((itoken, None))
+                    gr.tokens[itoken] = ilabel
                     return ilabel
         else:
             # Either a keyword or an operator
             assert label[0] in ('"', "'"), label
             value = eval(label)
             if value[0].isalpha():
                 # A keyword
-                if value in c.keywords:
-                    return c.keywords[value]
+                if value in gr.keywords:
+                    return gr.keywords[value]
                 else:
-                    c.labels.append((token.NAME, value))
-                    c.keywords[value] = ilabel
+                    gr.labels.append((token.NAME, value))
+                    gr.keywords[value] = ilabel
                     return ilabel
             else:
                 # An operator (any non-numeric token)
                 itoken = grammar.opmap[value] # Fails if unknown token
-                if itoken in c.tokens:
-                    return c.tokens[itoken]
+                if itoken in gr.tokens:
+                    return gr.tokens[itoken]
                 else:
-                    c.labels.append((itoken, None))
-                    c.tokens[itoken] = ilabel
+                    gr.labels.append((itoken, None))
+                    gr.tokens[itoken] = ilabel
                     return ilabel

     def addfirstsets(self):
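To make the label bookkeeping above concrete, here is a hedged sketch of the kinds of entries make_label() appends to gr.labels for a toy grammar. The indices and example labels are illustrative, not output from this commit; the one fixed fact is that grammar.Grammar() reserves entry 0 as EMPTY.

    # Sketch of gr.labels after generating a toy expression grammar.
    # Each entry is (token or symbol number, keyword string or None).
    gr_labels = [
        (0, 'EMPTY'),   # reserved entry 0, created by grammar.Grammar()
        (257, None),    # 'expr' -> a non-terminal, via gr.symbol2number
        (1, None),      # 'NAME' -> a named token (token.NAME == 1)
        (1, 'if'),      # "'if'" -> a keyword: NAME restricted to one value
        (14, None),     # "'+'"  -> an operator, via grammar.opmap (token.PLUS == 14)
    ]

classify() in parse.py then maps each incoming token to one of these ilabel indices, checking keywords first and falling back to the generic token type.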
@@ -330,10 +329,7 @@ def gettoken(self):

     def raise_error(self, msg, *args):
         if args:
-            try:
-                msg = msg % args
-            except:
-                msg = " ".join([msg] + list(map(str, args)))
+            msg = msg % args
         raise SyntaxError(msg, (self.filename, self.end[0],
                                 self.end[1], self.line))

11 changes: 6 additions & 5 deletions pgen2/pgen2-test.sh
@@ -54,13 +54,15 @@
 # - lex_mode_e.Block -- newlines are terminators
 # - lex_mode_e.CharClass -- regex char classes have different rules
 #   (outer regexes use Expr mode, I believe)
+# - lex_mode_e.TypeExpr -- because I hit the >> problem!
+#   >> is not an operator in type expressions
 # - lex_mode_e.Str  # simple double-quoted string literal?
 #     # I don't want all the mess
 #     # or you can post-process the LST and eliminate
 #     # undesirable shell constructs
 #
 # Extensions to pgen:
-# - take tokens from a different lexer
+# - take tokens from a different lexer -- see NOTES-pgen2.txt for syntax ideas
 # - callbacks to invoke the parser
 #   - hm actually the "driver" can do this because it sees all the tokens?
 #     - it's pushing rather than pulling.
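That first extension -- taking tokens from a different lexer -- is what the commit title refers to: the pgen2 Parser is push-based (addtoken), so nothing ties it to Python's tokenize module. A hedged sketch of what such an adapter could look like; the mapping table, the tok.id/tok.val/tok.line/tok.col attributes, and the Oil token names are hypothetical, chosen only to show the shape, not APIs from this commit:

    # Hypothetical adapter feeding Oil lexer output into a pgen2 Parser.
    # OIL_TO_PGEN2 and the token attributes below are illustrative.
    from opy.pgen2 import parse, token

    OIL_TO_PGEN2 = {
        'Lit_Chars': token.NAME,   # example mapping, not from this commit
        'Arith_Plus': token.PLUS,
    }

    def push_oil_tokens(p, oil_tokens):
        # `p` is a parse.Parser that has already had setup() called.
        for tok in oil_tokens:
            typ = OIL_TO_PGEN2[tok.id]
            if p.addtoken(typ, tok.val, (tok.line, tok.col)):
                return p.rootnode  # the grammar's start symbol was accepted
        raise parse.ParseError('incomplete input', None, None, None)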
@@ -171,9 +173,9 @@ calc-test() {
     'a + 2'
     '1 + 2*3/4'  # operator precedence and left assoc
     '"abc" + "def"'
-    #'2 ** 3 ** 4'  # right assoc
-    #'f(1, 2, 3)'
-    #'f(a[i], 2, 3)'
+    '2 ** 3 ** 4'  # right assoc
+    'f(1, 2, 3)'
+    'f(a[i], 2, 3)'

     # bad token
     'a * 3&4'
@@ -209,7 +211,6 @@ ll1-test() {
   done
 }

-
 all() {
   banner 'exprs'
   parse-exprs
