@@ -0,0 +1,207 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import collections
import pickle

# Local imports
from . import token, tokenize


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """
    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively changes all dicts to OrderedDicts, so the pickled
        file is not exactly the same as what was passed in to dump().  load()
        uses the pickled file to create the tables, but only changes
        OrderedDict to dict at the top level; it does not recursively change
        OrderedDict to dict.  So, the loaded tables are different from the
        original tables that were passed to dump() in that some of the
        OrderedDicts (from the pickled file) are not changed back to dicts.
        For parsing, this has no effect on performance because OrderedDict
        uses dict's __getitem__ with nothing in between.
        """
        with open(filename, "wb") as f:
            d = _make_deterministic(self.__dict__)
            pickle.dump(d, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)
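
    # A minimal round-trip sketch (illustrative only; the pickle filename is
    # an assumption, not part of this module):
    #
    #   g = pgen.generate_grammar("Grammar.txt")   # build tables with a subclass
    #   g.dump("Grammar.pickle")                   # write the pickled tables
    #   g2 = Grammar()
    #   g2.load("Grammar.pickle")                  # much faster than regenerating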

    def copy(self):
        """
        Copy the grammar.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)


def _make_deterministic(top):
    if isinstance(top, dict):
        return collections.OrderedDict(
            sorted(((k, _make_deterministic(v)) for k, v in top.items())))
    if isinstance(top, list):
        return [_make_deterministic(e) for e in top]
    if isinstance(top, tuple):
        return tuple(_make_deterministic(e) for e in top)
    return top

# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

opmap = {}
for line in opmap_raw.splitlines():
    if line:
        op, name = line.split()
        opmap[op] = getattr(token, name)
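
# Illustration only: a couple of entries from the resulting table, given the
# token module included later in this patch:
#
#   opmap["+"]   == token.PLUS
#   opmap["//="] == token.DOUBLESLASHEQUAL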

@@ -0,0 +1,201 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""

# Local imports
from . import token

class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(self, msg, type, value, context):
        Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
                           (msg, type, value, context))
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context

class Parser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar, convert=None):
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        self.convert = convert or (lambda grammar, node: node)
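
    # A minimal sketch of a convert function (an assumption for illustration,
    # not the converter used by lib2to3's own driver):
    #
    #   def convert(gr, raw_node):
    #       type, value, context, children = raw_node
    #       if children is not None:
    #           # a symbol (non-terminal) node: keep the symbol name and children
    #           return (gr.number2symbol[type], children)
    #       # a token leaf
    #       return (token.tok_name[type], value)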

    def setup(self, start=None):
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack = [stackentry]
        self.rootnode = None
        self.used_names = set()  # Aliased to self.rootnode.used_names in pop()

    def addtoken(self, type, value, context):
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabel = self.classify(type, value, context)
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t, v = self.grammar.labels[i]
                if ilabel == i:
                    # Look it up in the list of labels
                    assert t < 256
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    return False
                elif t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, self.grammar.dfas[t], newstate, context)
                        break # To continue the outer while loop
            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input",
                                         type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type, value, context):
        """Turn a token into a label.  (Internal)"""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            ilabel = self.grammar.keywords.get(value)
            if ilabel is not None:
                return ilabel
        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return ilabel

    def shift(self, type, value, newstate, context):
        """Shift a token.  (Internal)"""
        dfa, state, node = self.stack[-1]
        newnode = (type, value, context, None)
        newnode = self.convert(self.grammar, newnode)
        if newnode is not None:
            node[-1].append(newnode)
        self.stack[-1] = (dfa, newstate, node)

    def push(self, type, newdfa, newstate, context):
        """Push a nonterminal.  (Internal)"""
        dfa, state, node = self.stack[-1]
        newnode = (type, None, context, [])
        self.stack[-1] = (dfa, newstate, node)
        self.stack.append((newdfa, 0, newnode))

    def pop(self):
        """Pop a nonterminal.  (Internal)"""
        popdfa, popstate, popnode = self.stack.pop()
        newnode = self.convert(self.grammar, popnode)
        if newnode is not None:
            if self.stack:
                dfa, state, node = self.stack[-1]
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names

@@ -0,0 +1,386 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Pgen imports
from . import grammar, token, tokenize

class PgenGrammar(grammar.Grammar):
    pass

class ParserGenerator(object):

    def __init__(self, filename, stream=None):
        close_stream = None
        if stream is None:
            stream = open(filename)
            close_stream = stream.close
        self.filename = filename
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken() # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {} # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self):
        c = PgenGrammar()
        names = list(self.dfas.keys())
        names.sort()
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]
        return c

    def make_first(self, c, name):
        rawfirst = self.first[name]
        first = {}
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first[ilabel] = 1
        return first

    def make_label(self, c, label):
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = getattr(token, label, None)
                assert isinstance(itoken, int), label
                assert itoken in token.tok_name, label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)
            if value[0].isalpha():
                # A keyword
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((token.NAME, value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                itoken = grammar.opmap[value] # Fails if unknown token
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
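
    # Illustration (assumed, not from the original source) of the four label
    # kinds handled above, for a typical Python grammar:
    #
    #   make_label(c, "expr")   # non-terminal: label entry (c.symbol2number["expr"], None)
    #   make_label(c, "NAME")   # named token: label entry (token.NAME, None)
    #   make_label(c, "'if'")   # keyword: label entry (token.NAME, "if")
    #   make_label(c, "'+'")    # operator via grammar.opmap: (token.PLUS, None)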

    def addfirstsets(self):
        names = list(self.dfas.keys())
        names.sort()
        for name in names:
            if name not in self.first:
                self.calcfirst(name)
            #print name, self.first[name].keys()

    def calcfirst(self, name):
        dfa = self.dfas[name]
        self.first[name] = None # dummy to detect left recursion
        state = dfa[0]
        totalset = {}
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset[label] = 1
                overlapcheck[label] = {label: 1}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError("rule %s is ambiguous; %s is in the"
                                     " first sets of %s as well as %s" %
                                     (name, symbol, label, inverse[symbol]))
                inverse[symbol] = label
        self.first[name] = totalset

    def parse(self):
        dfas = {}
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            #self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            #self.dump_dfa(name, dfa)
            oldlen = len(dfa)
            self.simplify_dfa(dfa)
            newlen = len(dfa)
            dfas[name] = dfa
            #print name, oldlen, newlen
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol
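
    # The input parsed above is the pgen meta-grammar; a rule from a typical
    # Grammar.txt looks like this (shown for illustration only):
    #
    #   if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]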

    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
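        # (Illustration, not from the original source: each DFA state built
        # below is the epsilon-closure of a set of NFA states, where an
        # epsilon arc is one whose label is None.  For instance, if the NFA
        # start state has epsilon arcs to states 1 and 2, the DFA start state
        # is built from the closure {start, 1, 2}, and a DFA arc labeled 'x'
        # leads to the closure of every NFA state reachable from that set via
        # an 'x' arc.)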
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)
        def closure(state):
            base = {}
            addclosure(state, base)
            return base
        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)
        states = [DFAState(closure(start), finish)]
        for state in states: # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states # List of DFAState instances; first one is start

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print(" State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("  -> %d" % j)
                else:
                    print("  %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print(" State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("  %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.
        # dfa is a list of DFAState instances
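        # (Illustration, assumed: if states 3 and 5 have identical arcs, e.g.
        # both map 'NAME' to state 7, and agree on finality, state 5 is
        # removed and every arc that pointed at it is redirected to state 3.)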
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i+1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        #print " unify", i, j
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (token.NAME, token.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        #print token.tok_name[self.type], repr(self.value)

    def raise_error(self, msg, *args):
        if args:
            try:
                msg = msg % args
            except:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0],
                                self.end[1], self.line))

class NFAState(object):

    def __init__(self):
        self.arcs = [] # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))

class DFAState(object):

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {} # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None # For Py3 compatibility.

def generate_grammar(filename="Grammar.txt"):
    p = ParserGenerator(filename)
    return p.make_grammar()
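
# A hedged usage sketch (the pickle filename is an assumption, not part of
# this module):
#
#   g = generate_grammar("Grammar.txt")
#   g.dump("Grammar.pickle")   # cache the tables; Grammar.load() restores them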
| @@ -0,0 +1,85 @@ | ||
| #! /usr/bin/env python3 | ||
| """Token constants (from "token.h").""" | ||
| # Taken from Python (r53757) and modified to include some tokens | ||
| # originally monkeypatched in by pgen2.tokenize | ||
| #--start constants-- | ||
| ENDMARKER = 0 | ||
| NAME = 1 | ||
| NUMBER = 2 | ||
| STRING = 3 | ||
| NEWLINE = 4 | ||
| INDENT = 5 | ||
| DEDENT = 6 | ||
| LPAR = 7 | ||
| RPAR = 8 | ||
| LSQB = 9 | ||
| RSQB = 10 | ||
| COLON = 11 | ||
| COMMA = 12 | ||
| SEMI = 13 | ||
| PLUS = 14 | ||
| MINUS = 15 | ||
| STAR = 16 | ||
| SLASH = 17 | ||
| VBAR = 18 | ||
| AMPER = 19 | ||
| LESS = 20 | ||
| GREATER = 21 | ||
| EQUAL = 22 | ||
| DOT = 23 | ||
| PERCENT = 24 | ||
| BACKQUOTE = 25 | ||
| LBRACE = 26 | ||
| RBRACE = 27 | ||
| EQEQUAL = 28 | ||
| NOTEQUAL = 29 | ||
| LESSEQUAL = 30 | ||
| GREATEREQUAL = 31 | ||
| TILDE = 32 | ||
| CIRCUMFLEX = 33 | ||
| LEFTSHIFT = 34 | ||
| RIGHTSHIFT = 35 | ||
| DOUBLESTAR = 36 | ||
| PLUSEQUAL = 37 | ||
| MINEQUAL = 38 | ||
| STAREQUAL = 39 | ||
| SLASHEQUAL = 40 | ||
| PERCENTEQUAL = 41 | ||
| AMPEREQUAL = 42 | ||
| VBAREQUAL = 43 | ||
| CIRCUMFLEXEQUAL = 44 | ||
| LEFTSHIFTEQUAL = 45 | ||
| RIGHTSHIFTEQUAL = 46 | ||
| DOUBLESTAREQUAL = 47 | ||
| DOUBLESLASH = 48 | ||
| DOUBLESLASHEQUAL = 49 | ||
| AT = 50 | ||
| ATEQUAL = 51 | ||
| OP = 52 | ||
| COMMENT = 53 | ||
| NL = 54 | ||
| RARROW = 55 | ||
| AWAIT = 56 | ||
| ASYNC = 57 | ||
| ERRORTOKEN = 58 | ||
| N_TOKENS = 59 | ||
| NT_OFFSET = 256 | ||
| #--end constants-- | ||
| tok_name = {} | ||
| for _name, _value in list(globals().items()): | ||
| if type(_value) is type(0): | ||
| tok_name[_value] = _name | ||
| def ISTERMINAL(x): | ||
| return x < NT_OFFSET | ||
| def ISNONTERMINAL(x): | ||
| return x >= NT_OFFSET | ||
| def ISEOF(x): | ||
| return x == ENDMARKER |
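
# Illustration only (follows directly from the definitions above):
#
#   tok_name[NAME]        # -> 'NAME'
#   ISTERMINAL(NAME)      # -> True; token numbers are below NT_OFFSET
#   ISNONTERMINAL(256)    # -> True; symbol numbers start at NT_OFFSET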

@@ -0,0 +1,40 @@
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Export the Python grammar and symbols."""

# Python imports
import os

# Local imports
from .pgen2 import token
from .pgen2 import driver
from . import pytree

# The grammar file
_GRAMMAR_FILE = os.path.join(os.path.dirname(__file__), "Grammar.txt")
_PATTERN_GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
                                     "PatternGrammar.txt")


class Symbols(object):

    def __init__(self, grammar):
        """Initializer.

        Creates an attribute for each grammar symbol (nonterminal),
        whose value is the symbol's type (an int >= 256).
        """
        for name, symbol in grammar.symbol2number.items():
            setattr(self, name, symbol)


python_grammar = driver.load_grammar(_GRAMMAR_FILE)

python_symbols = Symbols(python_grammar)

python_grammar_no_print_statement = python_grammar.copy()
del python_grammar_no_print_statement.keywords["print"]

pattern_grammar = driver.load_grammar(_PATTERN_GRAMMAR_FILE)

pattern_symbols = Symbols(pattern_grammar)
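
# A hedged illustration of how these exports are typically used (symbol names
# depend on Grammar.txt; 'funcdef' is assumed to be defined there):
#
#   python_symbols.funcdef                                              # an int >= 256
#   python_grammar.number2symbol[python_symbols.funcdef] == 'funcdef'
#   'print' not in python_grammar_no_print_statement.keywords           # True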