#!/bin/bash
#
# Experiment: compile the ASDL arithmetic-parser demo with Shed Skin.
#
# Usage:
#   ./shedskin.sh <function name>
#
# Environment:
#   SHEDSKIN_SRC  Shed Skin git checkout used by install-latest
#                 (default: ~/git/languages/shedskin)

set -o nounset
set -o pipefail
set -o errexit

# Problems
# - loading pickle for metadata.  It has to dynamically look up classes.
#   - it won't compile the pickle module due to its use of marshal!
#   - TODO: we don't need metadata at all?

# Fixed
# - import posix removed in runtime.py
# - _CheckType uses AttributeError: Shed Skin doesn't like it

# Unfortunately the ShedSkin compiler crashes after 17 seconds with this error!
#
#     ts = typestrnew(gx, types, cplusplus, node, check_extmod, depth, check_ret, var, tuple_check, mv=mv)
#   File "/usr/lib/python2.7/dist-packages/shedskin/typestr.py", line 193, in typestrnew
#     elif not node or infer.inode(gx, node).mv.module.builtin:
# AttributeError: 'NoneType' object has no attribute 'module'
#
# real    0m17.210s
# user    0m17.083s
# sys     0m0.084s


# 0.9.4 was released in 2015.  Supposedly fixed in git!
#
# https://github.com/shedskin/shedskin/issues/203

# Build and install Shed Skin from a local git checkout (needs sudo).
install-latest() {
  # NOTE: I manually transcribed what I did.  Could use virtualenv?
  local src=${SHEDSKIN_SRC:-$HOME/git/languages/shedskin}

  pushd "$src"
  python setup.py build
  sudo python setup.py install
  popd  # restore the caller's working directory
}

# Copy the runtime dependencies of the generated code into a scratch tree.
make-tree() {
  local out=_tmp/shedskin
  mkdir -p "$out"
  #cp -v asdl/{arith_parse.py,tdop.py} _devbuild/gen/demo_asdl.py $out

  # dependencies of generated code
  # unpickle probably won't work
  cp -v asdl/{const.py,runtime.py} "$out"
}

# Sanity check: run the demo under the regular Python interpreter.
run-python() {
  pushd demo/shedskin
  ./arith_parse.py '1+2'
  popd
}

# With latest, this generates C++ code, but it doesn't compile.
#
# TODO: Try something based on tdop.py that is a single module?  There are too
# many modules here.
+ +compile() { + pushd demo/shedskin + time shedskin arith_parse +} + +count-output() { + wc -l demo/shedskin/*.{cpp,hpp} Makefile +} + +"$@" diff --git a/demo/shedskin/arith_parse.py b/demo/shedskin/arith_parse.py new file mode 100755 index 0000000000..c3ce7fb075 --- /dev/null +++ b/demo/shedskin/arith_parse.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python +""" +arith_parse.py: Parse shell-like and C-like arithmetic. +""" +from __future__ import print_function + +import sys + +import tdop +from tdop import CompositeNode + +import demo_asdl + +arith_expr = demo_asdl.arith_expr +op_id_e = demo_asdl.op_id_e + + +# +# Null Denotation -- token that takes nothing on the left +# + +def NullConstant(p, token, bp): + if token.type == 'number': + return arith_expr.Const(token.val) + # We have to wrap a string in some kind of variant. + if token.type == 'name': + return arith_expr.ArithVar(token.val) + + raise AssertionError(token.type) + + +def NullParen(p, token, bp): + """ Arithmetic grouping """ + r = p.ParseUntil(bp) + p.Eat(')') + return r + + +def NullPrefixOp(p, token, bp): + """Prefix operator. + + Low precedence: return, raise, etc. + return x+y is return (x+y), not (return x) + y + + High precedence: logical negation, bitwise complement, etc. 
+ !x && y is (!x) && y, not !(x && y) + """ + r = p.ParseUntil(bp) + return CompositeNode(token, [r]) + + +def NullIncDec(p, token, bp): + """ ++x or ++x[1] """ + right = p.ParseUntil(bp) + if right.token.type not in ('name', 'get'): + raise tdop.ParseError("Can't assign to %r (%s)" % (right, right.token)) + return CompositeNode(token, [right]) + + +# +# Left Denotation -- token that takes an expression on the left +# + +def LeftIncDec(p, token, left, rbp): + """ For i++ and i-- + """ + if left.token.type not in ('name', 'get'): + raise tdop.ParseError("Can't assign to %r (%s)" % (left, left.token)) + token.type = 'post' + token.type + return CompositeNode(token, [left]) + + +def LeftIndex(p, token, left, unused_bp): + """ index f[x+1] """ + # f[x] or f[x][y] + if not isinstance(left, demo_asdl.ArithVar): + raise tdop.ParseError("%s can't be indexed" % left) + index = p.ParseUntil(0) + if p.AtToken(':'): + p.Next() + end = p.ParseUntil(0) + else: + end = None + + p.Eat(']') + + # TODO: If you see ], then + # 1:4 + # 1:4:2 + # Both end and step are optional + + if end: + return demo_asdl.Slice(left, index, end, None) + else: + return demo_asdl.Index(left, index) + + +def LeftTernary(p, token, left, bp): + """ e.g. a > 1 ? x : y """ + true_expr = p.ParseUntil(bp) + p.Eat(':') + false_expr = p.ParseUntil(bp) + children = [left, true_expr, false_expr] + return CompositeNode(token, children) + + +def LeftBinaryOp(p, token, left, rbp): + """ Normal binary operator like 1+2 or 2*3, etc. """ + if token.val == '+': + op_id_ = op_id_e.Plus + elif token.val == '-': + op_id_ = op_id_e.Minus + elif token.val == '*': + op_id_ = op_id_e.Star + else: + raise AssertionError(token.val) + return arith_expr.ArithBinary(op_id_, left, p.ParseUntil(rbp)) + + +def LeftAssign(p, token, left, rbp): + """ Normal binary operator like 1+2 or 2*3, etc. 
""" + # x += 1, or a[i] += 1 + if left.token.type not in ('name', 'get'): + raise tdop.ParseError("Can't assign to %r (%s)" % (left, left.token)) + return CompositeNode(token, [left, p.ParseUntil(rbp)]) + + +def LeftComma(p, token, left, rbp): + """ foo, bar, baz + + Could be sequencing operator, or tuple without parens + """ + r = p.ParseUntil(rbp) + if left.token.type == ',': # Keep adding more children + left.children.append(r) + return left + children = [left, r] + return CompositeNode(token, children) + + +# For overloading of , inside function calls +COMMA_PREC = 1 + +def LeftFuncCall(p, token, left, unused_bp): + """ Function call f(a, b). """ + args = [] + # f(x) or f[i](x) + if not isinstance(left, demo_asdl.ArithVar): + raise tdop.ParseError("%s can't be called" % left) + func_name = left.name # get a string + + while not p.AtToken(')'): + # We don't want to grab the comma, e.g. it is NOT a sequence operator. So + # set the precedence to 5. + args.append(p.ParseUntil(COMMA_PREC)) + if p.AtToken(','): + p.Next() + p.Eat(")") + return demo_asdl.FuncCall(func_name, args) + + +def MakeShellParserSpec(): + """ + Create a parser. 
+ + Compare the code below with this table of C operator precedence: + http://en.cppreference.com/w/c/language/operator_precedence + """ + spec = tdop.ParserSpec() + + spec.Left(31, LeftIncDec, ['++', '--']) + spec.Left(31, LeftFuncCall, ['(']) + spec.Left(31, LeftIndex, ['[']) + + # 29 -- binds to everything except function call, indexing, postfix ops + spec.Null(29, NullIncDec, ['++', '--']) + spec.Null(29, NullPrefixOp, ['+', '!', '~', '-']) + + # Right associative: 2 ** 3 ** 2 == 2 ** (3 ** 2) + spec.LeftRightAssoc(27, LeftBinaryOp, ['**']) + spec.Left(25, LeftBinaryOp, ['*', '/', '%']) + + spec.Left(23, LeftBinaryOp, ['+', '-']) + spec.Left(21, LeftBinaryOp, ['<<', '>>']) + spec.Left(19, LeftBinaryOp, ['<', '>', '<=', '>=']) + spec.Left(17, LeftBinaryOp, ['!=', '==']) + + spec.Left(15, LeftBinaryOp, ['&']) + spec.Left(13, LeftBinaryOp, ['^']) + spec.Left(11, LeftBinaryOp, ['|']) + spec.Left(9, LeftBinaryOp, ['&&']) + spec.Left(7, LeftBinaryOp, ['||']) + + spec.LeftRightAssoc(5, LeftTernary, ['?']) + + # Right associative: a = b = 2 is a = (b = 2) + spec.LeftRightAssoc(3, LeftAssign, [ + '=', + '+=', '-=', '*=', '/=', '%=', + '<<=', '>>=', '&=', '^=', '|=']) + + spec.Left(COMMA_PREC, LeftComma, [',']) + + # 0 precedence -- doesn't bind until ) + spec.Null(0, NullParen, ['(']) # for grouping + + # -1 precedence -- never used + spec.Null(-1, NullConstant, ['name', 'number']) + spec.Null(-1, tdop.NullError, [')', ']', ':', 'eof']) + + return spec + + +def MakeParser(s): + """Used by tests.""" + spec = MakeShellParserSpec() + lexer = tdop.Tokenize(s) + p = tdop.Parser(spec, lexer) + return p + + +def ParseShell(s, expected=None): + """Used by tests.""" + p = MakeParser(s) + tree = p.Parse() + + sexpr = repr(tree) + if expected is not None: + assert sexpr == expected, '%r != %r' % (sexpr, expected) + + #print('%-40s %s' % (s, sexpr)) + return tree + + +def main(argv): + try: + s = argv[1] + except IndexError: + print('Usage: ./arith_parse.py EXPRESSION') + else: + 
try: + tree = ParseShell(s) + except tdop.ParseError as e: + print('Error parsing %r: %s' % (s, e), file=sys.stderr) + print(tree) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/demo/shedskin/const.py b/demo/shedskin/const.py new file mode 100644 index 0000000000..8ac0b72864 --- /dev/null +++ b/demo/shedskin/const.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +""" +const.py +""" + +DEFAULT_INT_WIDTH = 3 # 24 bits + +# 2^24 - 1 is used as an invalid/uninitialized value for ASDL integers. + +# Why? We have a few use cases for invalid/sentinel values: +# - span_id, line_id. Sometimes we don't have a span ID. +# - file descriptor: 'read x < f.txt' vs 'read x 0< f.txt' +# +# Other options for representation: +# +# 1. ADSL could use signed integers, then -1 is valid. +# 2. Use a type like fd = None | Some(int fd) +# +# I don't like #1 because ASDL is lazily-decoded, and then we have to do sign +# extension on demand. (24 bits to 32 or 64). As far as I can tell, sign +# extension requires a branch, at least in portable C (on the sign bit). +# +# Thes second option is semantically cleaner. But it needlessly +# inflates the size of both the source code and the data. Instead of having a +# single "inline" integer, we would need a reference to another value. +# +# We could also try to do some fancy thing like fd = None | +# Range<1..max_fd>(fd), with smart encoding. But that is overkill for these +# use cases. +# +# Using InvalidInt instead of -1 seems like a good compromise. + +NO_INTEGER = (1 << (DEFAULT_INT_WIDTH * 8)) - 1 + +# NOTE: In Python: 1 << (n * 8) - 1 is wrong! I thought that bit shift would +# have higher precedence. 
"""
demo_asdl.py: Python classes for the demo ASDL schema.

Each sum type gets a base class, an '<name>_e' class of integer tags, and one
'<sum>__<Constructor>' subclass per variant, which is also attached to the
base class as an attribute (e.g. arith_expr.Const).  Product types are plain
runtime.CompoundObj subclasses.

Every compound constructor takes all fields as optional keyword arguments and
gives list-valued fields a fresh empty list when they are omitted.

The per-class 'ASDL_TYPE = TYPE_LOOKUP[...]' reflection metadata is disabled
in this copy, together with the pickle loading shown commented out below.
"""
import const  # For const.NO_INTEGER
import runtime
#from pylib import unpickle

#from core import util

#f = util.GetResourceLoader().open('_devbuild/demo_asdl.pickle')
#TYPE_LOOKUP = unpickle.load_v2_subset(f)
#f.close()
#TYPE_LOOKUP = {}


class op_id_e(runtime.SimpleObj):
    """Enum of operator ids."""
    pass

op_id_e.Plus = op_id_e(1, 'Plus')
op_id_e.Minus = op_id_e(2, 'Minus')
op_id_e.Star = op_id_e(3, 'Star')


class cflow_e(object):
    """Integer tags for the variants of cflow."""
    Break = 1
    Continue = 2
    Return = 3


class cflow(runtime.CompoundObj):
    """Base class of the cflow sum type."""
    pass


class cflow__Break(cflow):
    tag = 1


class cflow__Continue(cflow):
    tag = 2


class cflow__Return(cflow):
    tag = 3
    __slots__ = ('status', 'spids')

    def __init__(self, status=None, spids=None):
        self.status = status
        self.spids = spids or []

cflow.Break = cflow__Break
cflow.Continue = cflow__Continue
cflow.Return = cflow__Return


class source_location(runtime.CompoundObj):
    __slots__ = ('path', 'line', 'col', 'length', 'spids')

    def __init__(self, path=None, line=None, col=None, length=None,
                 spids=None):
        self.path = path
        self.line = line
        self.col = col
        self.length = length
        self.spids = spids or []


class token(runtime.CompoundObj):
    __slots__ = ('id', 'value', 'span_id', 'spids')

    def __init__(self, id=None, value=None, span_id=None, spids=None):
        self.id = id
        self.value = value
        # Only a missing span_id becomes the sentinel.  'span_id or ...' would
        # also replace a span id of 0 with const.NO_INTEGER.
        self.span_id = const.NO_INTEGER if span_id is None else span_id
        self.spids = spids or []


class assign(runtime.CompoundObj):
    __slots__ = ('name', 'flags', 'spids')

    def __init__(self, name=None, flags=None, spids=None):
        self.name = name
        self.flags = flags or []
        self.spids = spids or []


class arith_expr_e(object):
    """Integer tags for the variants of arith_expr."""
    Const = 1
    ArithVar = 2
    ArithUnary = 3
    ArithBinary = 4
    FuncCall = 5
    ForwardRef = 6
    Index = 7
    Slice = 8


class arith_expr(runtime.CompoundObj):
    """Base class of the arith_expr sum type."""
    pass


class arith_expr__Const(arith_expr):
    tag = 1
    __slots__ = ('i', 'spids')

    def __init__(self, i=None, spids=None):
        self.i = i
        self.spids = spids or []


class arith_expr__ArithVar(arith_expr):
    tag = 2
    __slots__ = ('name', 'spids')

    def __init__(self, name=None, spids=None):
        self.name = name
        self.spids = spids or []


class arith_expr__ArithUnary(arith_expr):
    tag = 3
    __slots__ = ('op_id', 'a', 'spids')

    def __init__(self, op_id=None, a=None, spids=None):
        self.op_id = op_id
        self.a = a
        self.spids = spids or []


class arith_expr__ArithBinary(arith_expr):
    tag = 4
    __slots__ = ('op_id', 'left', 'right', 'spids')

    def __init__(self, op_id=None, left=None, right=None, spids=None):
        self.op_id = op_id
        self.left = left
        self.right = right
        self.spids = spids or []


class arith_expr__FuncCall(arith_expr):
    tag = 5
    __slots__ = ('name', 'args', 'spids')

    def __init__(self, name=None, args=None, spids=None):
        self.name = name
        self.args = args or []
        self.spids = spids or []


class arith_expr__ForwardRef(arith_expr):
    tag = 6
    __slots__ = ('b', 'spids')

    def __init__(self, b=None, spids=None):
        self.b = b
        self.spids = spids or []


class arith_expr__Index(arith_expr):
    tag = 7
    __slots__ = ('a', 'index', 'spids')

    def __init__(self, a=None, index=None, spids=None):
        self.a = a
        self.index = index
        self.spids = spids or []


class arith_expr__Slice(arith_expr):
    tag = 8
    __slots__ = ('a', 'begin', 'end', 'stride', 'spids')

    def __init__(self, a=None, begin=None, end=None, stride=None,
                 spids=None):
        self.a = a
        # Optional operands are stored as given; None means "absent".  The
        # former 'x or None' silently discarded any falsy operand.
        self.begin = begin
        self.end = end
        self.stride = stride
        self.spids = spids or []

arith_expr.Const = arith_expr__Const
arith_expr.ArithVar = arith_expr__ArithVar
arith_expr.ArithUnary = arith_expr__ArithUnary
arith_expr.ArithBinary = arith_expr__ArithBinary
arith_expr.FuncCall = arith_expr__FuncCall
arith_expr.ForwardRef = arith_expr__ForwardRef
arith_expr.Index = arith_expr__Index
arith_expr.Slice = arith_expr__Slice


class word(runtime.CompoundObj):
    __slots__ = ('value', 'spids')

    def __init__(self, value=None, spids=None):
        self.value = value
        self.spids = spids or []


class bool_expr_e(object):
    """Integer tags for the variants of bool_expr."""
    BoolBinary = 1
    BoolUnary = 2
    LogicalNot = 3
    LogicalAnd = 4
    LogicalOr = 5


class bool_expr(runtime.CompoundObj):
    """Base class of the bool_expr sum type."""
    pass


class bool_expr__BoolBinary(bool_expr):
    tag = 1
    __slots__ = ('left', 'right', 'spids')

    def __init__(self, left=None, right=None, spids=None):
        self.left = left
        self.right = right
        self.spids = spids or []


class bool_expr__BoolUnary(bool_expr):
    tag = 2
    __slots__ = ('child', 'spids')

    def __init__(self, child=None, spids=None):
        self.child = child
        self.spids = spids or []


class bool_expr__LogicalNot(bool_expr):
    tag = 3
    __slots__ = ('b', 'spids')

    def __init__(self, b=None, spids=None):
        self.b = b
        self.spids = spids or []


class bool_expr__LogicalAnd(bool_expr):
    tag = 4
    __slots__ = ('left', 'right', 'spids')

    def __init__(self, left=None, right=None, spids=None):
        self.left = left
        self.right = right
        self.spids = spids or []


class bool_expr__LogicalOr(bool_expr):
    tag = 5
    __slots__ = ('left', 'right', 'spids')

    def __init__(self, left=None, right=None, spids=None):
        self.left = left
        self.right = right
        self.spids = spids or []

bool_expr.BoolBinary = bool_expr__BoolBinary
bool_expr.BoolUnary = bool_expr__BoolUnary
bool_expr.LogicalNot = bool_expr__LogicalNot
bool_expr.LogicalAnd = bool_expr__LogicalAnd
bool_expr.LogicalOr = bool_expr__LogicalOr
class _RuntimeType(object):
    """A node hierarchy that exists at runtime."""
    pass


# NOTE: Every descriptor repr uses the same '<Name ...>' shape as
# SimpleObj.__repr__ below.  The format strings must contain one conversion
# per argument; an empty format string makes the % operator raise TypeError.

class StrType(_RuntimeType):
    def __repr__(self):
        return '<Str>'


class IntType(_RuntimeType):
    def __repr__(self):
        return '<Int>'


class BoolType(_RuntimeType):
    def __repr__(self):
        return '<Bool>'


class DictType(_RuntimeType):
    def __repr__(self):
        return '<Dict>'


class ArrayType(_RuntimeType):
    """Descriptor that wraps another descriptor, stored as .desc."""

    def __init__(self, desc):
        self.desc = desc

    def __repr__(self):
        return '<Array %s>' % (self.desc,)


class MaybeType(_RuntimeType):
    """Descriptor that wraps another descriptor, stored as .desc."""

    def __init__(self, desc):
        self.desc = desc  # another descriptor

    def __repr__(self):
        return '<Maybe %s>' % (self.desc,)


class UserType(_RuntimeType):
    """Descriptor that wraps a Python class, stored as .typ."""

    def __init__(self, typ):
        assert isinstance(typ, type), typ
        self.typ = typ

    def __repr__(self):
        return '<UserType %s>' % (self.typ,)


class SumType(_RuntimeType):
    """Dummy node that doesn't require any reflection.

    obj.ASDL_TYPE points directly to the constructor, which you reflect on.
    """
    def __init__(self, is_simple):
        self.is_simple = is_simple  # for type checking
        self.cases = []  # list of _RuntimeType for type checking

    def __repr__(self):
        # We need an entry for this but we don't use it?
        return '<SumType with %d cases at %d>' % (
            len(self.cases), id(self))


class CompoundType(_RuntimeType):
    """A product or Constructor instance.  Both have fields."""
    def __init__(self, fields):
        # List of (name, _RuntimeType) tuples.
        # NOTE: This list may be mutated after it's set.
        self.fields = fields

    def __repr__(self):
        # Wrap in a 1-tuple so a tuple-valued .fields is formatted as one
        # value instead of being unpacked as the argument list.
        return '<CompoundType %s>' % (self.fields,)

    def GetFieldNames(self):
        """Yield the name of every field, in declaration order."""
        for field_name, _ in self.fields:
            yield field_name

    def GetFields(self):
        """Yield (name, descriptor) for every field, in declaration order."""
        for field_name, descriptor in self.fields:
            yield field_name, descriptor

    def LookupFieldType(self, field_name):
        """Return the descriptor of the named field.

        NOTE: Only used by py_meta.py.

        Raises:
          TypeError: if there is no field with that name.
        """
        for n, descriptor in self.fields:
            if n == field_name:
                return descriptor
        raise TypeError('Invalid field %r' % field_name)


BUILTIN_TYPES = {
    'string': StrType(),
    'int': IntType(),
    'bool': BoolType(),
    'dict': DictType(),
}


class Obj(object):
    # NOTE: We're using CAPS for these static fields, since they are constant
    # at runtime after metaprogramming.
    ASDL_TYPE = None  # Used for type checking


class SimpleObj(Obj):
    """An enum value.

    Other simple objects: int, str, maybe later a float.
    """
    def __init__(self, enum_id, name):
        self.enum_id = enum_id
        self.name = name

    # TODO: Why is __hash__ needed?  Otherwise native/fastlex_test.py fails.
    # util.Enum required it too.  I thought that instances would hash by
    # identity?
    #
    # Example:
    # class bool_arg_type_e(py_meta.SimpleObj):
    #   ASDL_TYPE = TYPE_LOOKUP['bool_arg_type']
    # bool_arg_type_e.Undefined = bool_arg_type_e(1, 'Undefined')

    def __hash__(self):
        # Could it be the integer self.enum_id?
        return hash(self.__class__.__name__ + self.name)

    def __repr__(self):
        return '<%s %s %s>' % (
            self.__class__.__name__, self.name, self.enum_id)


NUM_TYPE_CHECKS = 0


class CompoundObj(Obj):
    # TODO: Remove tag?
    # The tag is always set for constructor types, which are subclasses of sum
    # types.  Never set for product types.
    tag = None


# Other possible dynamic checking:
# - CheckUnassigned in the constructor?  Fields should be all initialized or
#   none.
# - Maybe spids should never be mutated?  It can only be appended to?
# - SimpleObj could deny all __setattr__?
#!/usr/bin/env python
"""
tdop.py: A small table-driven top-down operator precedence parser.

A ParserSpec maps token types to a "null denotation" handler (token with
nothing on its left) and a "left denotation" handler (token with an
expression on its left), each with binding powers.  Parser drives those
handlers over a stream of Token objects.
"""

import re


class ParseError(Exception):
    """Raised when the input cannot be tokenized or parsed."""
    pass


#
# Default parsing functions give errors
#

def NullError(p, token, bp):
    """Default null denotation: the token is not valid in prefix position."""
    raise ParseError("%s can't be used in prefix position" % token)


def LeftError(p, token, left, rbp):
    """Default left denotation: the token is not valid in infix position."""
    # Hm is this not called because of binding power?
    raise ParseError("%s can't be used in infix position" % token)


#
# Input
#

class Token(object):
    """A lexical token: a type, a value and an optional location."""

    def __init__(self, type, val, loc=None):
        self.type = type
        self.val = val
        self.loc = loc  # stored as given; not interpreted in this module

    def __repr__(self):
        # Used in every ParseError message that mentions a token, so the
        # format string must consume both arguments.
        return '<Token %s %s>' % (self.type, self.val)


#
# Using the pattern here: http://effbot.org/zone/xml-scanner.htm
#

# NOTE: () and [] need to be on their own so (-1+2) works
# Raw string so that \s, \d and \w reach the regex engine untouched.
TOKEN_RE = re.compile(r"""
\s* (?: (\d+) | (\w+) | ( [\-\+\*/%!~<>=&^|?:,]+ ) | ([\(\)\[\]]) )
""", re.VERBOSE)


def Tokenize(s):
    """Yield the Tokens of string s.

    Numbers get type 'number' and an int value, identifiers get type 'name',
    and operator runs and brackets use their own text as both type and value.

    Raises:
      ParseError: when a character that is not part of any token is reached.
        Tokens before it have already been yielded.
    """
    pos = 0
    while True:
        m = TOKEN_RE.match(s, pos)
        if m is None:
            # Either only whitespace is left, or the next character belongs
            # to no token.  Report it rather than silently skipping it.
            rest = s[pos:].lstrip()
            if rest:
                raise ParseError('Unexpected character %r' % rest[0])
            return

        number, name, op, bracket = m.groups()
        if number:
            typ = 'number'
            val = int(number)
        elif name:
            typ = 'name'
            val = name
        elif op:
            typ = op
            val = op
        else:
            typ = bracket
            val = bracket
        yield Token(typ, val, loc=(0, 0))
        pos = m.end()


#
# Simple and Composite AST nodes
#

class Node(object):
    """Leaf AST node wrapping a single token."""

    def __init__(self, token):
        """
        Args:
          token: the Token this node was built from
        """
        self.token = token

    def __repr__(self):
        return str(self.token.val)


class CompositeNode(Node):
    """AST node with an operator token and a list of children."""

    def __init__(self, token, children):
        """
        Args:
          token: the operator Token
          children: list of child nodes
        """
        Node.__init__(self, token)
        self.children = children

    def __repr__(self):
        # S-expression form, e.g. (+ 1 2)
        args = ''.join([" " + repr(c) for c in self.children])
        return "(" + self.token.type + args + ")"


#
# Parser definition
#

class LeftInfo(object):
    """Row for operator.

    In C++ this should be a big array.
    """
    def __init__(self, led=None, lbp=0, rbp=0):
        self.led = led or LeftError
        self.lbp = lbp
        self.rbp = rbp


class NullInfo(object):
    """Row for operator.

    In C++ this should be a big array.
    """
    def __init__(self, nud=None, bp=0):
        self.nud = nud or NullError
        self.bp = bp


class ParserSpec(object):
    """Specification for a TDOP parser."""

    def __init__(self):
        self.null_lookup = {}
        self.left_lookup = {}

    def Null(self, bp, nud, tokens):
        """Register a token that doesn't take anything on the left.

        Examples: constant, prefix operator, error.
        """
        for token in tokens:
            self.null_lookup[token] = NullInfo(nud=nud, bp=bp)
            if token not in self.left_lookup:
                self.left_lookup[token] = LeftInfo()  # error

    def _RegisterLed(self, lbp, rbp, led, tokens):
        for token in tokens:
            if token not in self.null_lookup:
                self.null_lookup[token] = NullInfo(NullError)
            self.left_lookup[token] = LeftInfo(lbp=lbp, rbp=rbp, led=led)

    def Left(self, bp, led, tokens):
        """Register a token that takes an expression on the left."""
        self._RegisterLed(bp, bp, led, tokens)

    def LeftRightAssoc(self, bp, led, tokens):
        """Register a right associative operator."""
        # The right binding power is one lower than the left one, so the same
        # operator on the right binds tighter and is parsed first.
        self._RegisterLed(bp, bp-1, led, tokens)

    def LookupNull(self, token):
        """Get the parsing function and precedence for a null position token.

        Raises:
          ParseError: if the token type was never registered.
        """
        try:
            nud = self.null_lookup[token]
        except KeyError:
            raise ParseError('Unexpected token %r' % token)
        return nud

    def LookupLeft(self, token):
        """Get the parsing function and precedence for a left position token.

        Raises:
          ParseError: if the token type was never registered.
        """
        try:
            led = self.left_lookup[token]
        except KeyError:
            raise ParseError('Unexpected token %r' % token)
        return led


EOF_TOKEN = Token('eof', 'eof')


class Parser(object):
    """Recursive TDOP parser."""

    def __init__(self, spec, lexer):
        self.spec = spec
        # Accept any iterable of Tokens (generator, list, ...).
        self.lexer = iter(lexer)
        self.token = None  # current token

    def AtToken(self, token_type):
        """Test if we are looking at a token."""
        return self.token.type == token_type

    def Next(self):
        """Move to the next token; past the end, stay on EOF_TOKEN."""
        try:
            # The next() builtin works on Python 2.6+ and Python 3, unlike
            # the Python-2-only .next() method.
            t = next(self.lexer)
        except StopIteration:
            t = EOF_TOKEN
        self.token = t

    def Eat(self, val):
        """Assert the type of the current token, then move to the next token.

        A false val skips the check.

        Raises:
          ParseError: if the current token does not have type val.
        """
        if val and not self.AtToken(val):
            raise ParseError('expected %s, got %s' % (val, self.token))
        self.Next()

    def ParseUntil(self, rbp):
        """
        Parse to the right, eating tokens until we encounter a token with
        binding power LESS THAN OR EQUAL TO rbp.
        """
        if self.AtToken('eof'):
            raise ParseError('Unexpected end of input')

        t = self.token
        self.Next()  # skip over the token, e.g. ! ~ + -

        null_info = self.spec.LookupNull(t.type)
        node = null_info.nud(self, t, null_info.bp)

        while True:
            t = self.token
            left_info = self.spec.LookupLeft(t.type)

            # Examples, for a spec where + is Left(23), * is Left(25) and **
            # is LeftRightAssoc(27):
            # If we see 1*2+  , rbp = 25 and lbp = 23, so stop.
            # If we see 1+2+  , rbp = 23 and lbp = 23, so stop.
            # If we see 1**2**, rbp = 26 and lbp = 27, so keep going.
            if rbp >= left_info.lbp:
                break
            self.Next()  # skip over the token, e.g. / *

            node = left_info.led(self, t, node, left_info.rbp)

        return node

    def Parse(self):
        """Parse one complete expression from the start of the input."""
        self.Next()
        return self.ParseUntil(0)