View
@@ -0,0 +1,242 @@
#!/usr/bin/python3
"""
arith_parse.py: Parse shell-like and C-like arithmetic.
"""
import sys
import tdop
from tdop import Node, CompositeNode
import arith_ast
#
# Null Denotation -- token that takes nothing on the left
#
def NullConstant(p, token, bp):
  """Create a leaf node for a literal number or a variable name."""
  kind = token.type
  if kind == 'number':
    return arith_ast.Const(token.val)
  if kind == 'name':
    # A bare name has to be wrapped in some kind of variant, not used raw.
    return arith_ast.ArithVar(token.val)
  raise AssertionError(kind)
def NullParen(p, token, bp):
  """Arithmetic grouping: parse the inner expression, then require ')'."""
  inner = p.ParseUntil(bp)
  p.Eat(')')
  return inner
def NullPrefixOp(p, token, bp):
  """Prefix operator.

  Low precedence: return, raise, etc.
    return x+y is return (x+y), not (return x) + y
  High precedence: logical negation, bitwise complement, etc.
    !x && y is (!x) && y, not !(x && y)
  """
  operand = p.ParseUntil(bp)
  return CompositeNode(token, [operand])
def NullIncDec(p, token, bp):
  """Prefix increment/decrement: ++x or ++x[1]."""
  operand = p.ParseUntil(bp)
  # Only variables and index expressions ('get') are assignable.
  if operand.token.type not in ('name', 'get'):
    raise tdop.ParseError("Can't assign to %r (%s)" % (operand, operand.token))
  return CompositeNode(token, [operand])
#
# Left Denotation -- token that takes an expression on the left
#
def LeftIncDec(p, token, left, rbp):
  """Postfix increment/decrement: i++ or i--."""
  if left.token.type not in ('name', 'get'):
    raise tdop.ParseError("Can't assign to %r (%s)" % (left, left.token))
  # Rewrite the token type so the node prints as 'post++' / 'post--'.
  token.type = 'post' + token.type
  return CompositeNode(token, [left])
def LeftIndex(p, token, left, unused_bp):
  """Index or slice: f[x+1] or f[1:4].

  Only a plain variable (ArithVar) may be indexed; chained indexing like
  f[x][y] is not supported by this check.
  """
  if not isinstance(left, arith_ast.ArithVar):
    raise tdop.ParseError("%s can't be indexed" % left)
  index = p.ParseUntil(0)
  if p.AtToken(':'):
    p.Next()
    end = p.ParseUntil(0)
  else:
    end = None
  p.Eat(']')

  # TODO: support an optional step as well, i.e. both 1:4 and 1:4:2.
  # Test 'is not None' rather than truthiness so that a falsy end-expression
  # node can't silently turn a slice into a plain index.
  if end is not None:
    return arith_ast.Slice(left, index, end, None)
  else:
    return arith_ast.Index(left, index)
def LeftTernary(p, token, left, bp):
  """Ternary conditional, e.g. a > 1 ? x : y."""
  true_expr = p.ParseUntil(bp)
  p.Eat(':')
  false_expr = p.ParseUntil(bp)
  return CompositeNode(token, [left, true_expr, false_expr])
def LeftBinaryOp(p, token, left, rbp):
  """Normal binary operator like 1+2 or 2*3, etc."""
  # NOTE(review): the parser spec registers this handler for many more
  # operators ('/', '%', '<<', comparisons, logical ops, ...), but only
  # '+', '-' and '*' are mapped here -- anything else hits the
  # AssertionError below.  Presumably arith_ast.op_id has members for the
  # rest; TODO confirm and extend the mapping.
  if token.val == '+':
    op_id = arith_ast.op_id.Plus
  elif token.val == '-':
    op_id = arith_ast.op_id.Minus
  elif token.val == '*':
    op_id = arith_ast.op_id.Star
  else:
    raise AssertionError(token.val)
  return arith_ast.ArithBinary(op_id, left, p.ParseUntil(rbp))
def LeftAssign(p, token, left, rbp):
  """Assignment operator: x = 1, x += 1, or a[i] += 1."""
  # Only variables and index expressions ('get') are assignable.
  if left.token.type not in ('name', 'get'):
    raise tdop.ParseError("Can't assign to %r (%s)" % (left, left.token))
  rhs = p.ParseUntil(rbp)
  return CompositeNode(token, [left, rhs])
def LeftComma(p, token, left, rbp):
  """Comma: foo, bar, baz.

  Could be a sequencing operator, or a tuple without parens.
  """
  rest = p.ParseUntil(rbp)
  if left.token.type == ',':
    # Already a comma node: flatten by appending instead of nesting.
    left.children.append(rest)
    return left
  return CompositeNode(token, [left, rest])
# For overloading of , inside function calls
COMMA_PREC = 1

def LeftFuncCall(p, token, left, unused_bp):
  """Function call f(a, b)."""
  args = []
  # f(x) or f[i](x) -- only a plain variable can be called.
  if not isinstance(left, arith_ast.ArithVar):
    raise tdop.ParseError("%s can't be called" % left)
  func_name = left.name  # get a string
  while not p.AtToken(')'):
    # We don't want to grab the comma, e.g. it is NOT a sequence operator
    # inside a call.  So parse each argument at COMMA_PREC, which stops at
    # the ',' handler registered at that same precedence.
    args.append(p.ParseUntil(COMMA_PREC))
    if p.AtToken(','):
      p.Next()
  p.Eat(")")
  return arith_ast.FuncCall(func_name, args)
def MakeShellParserSpec():
  """Create a parser spec for shell-like arithmetic.

  Compare the code below with this table of C operator precedence:
  http://en.cppreference.com/w/c/language/operator_precedence
  """
  spec = tdop.ParserSpec()

  # 31 -- highest precedence: postfix ++/--, function call, indexing
  spec.Left(31, LeftIncDec, ['++', '--'])
  spec.Left(31, LeftFuncCall, ['('])
  spec.Left(31, LeftIndex, ['['])

  # 29 -- binds to everything except function call, indexing, postfix ops
  spec.Null(29, NullIncDec, ['++', '--'])
  spec.Null(29, NullPrefixOp, ['+', '!', '~', '-'])

  # Right associative: 2 ** 3 ** 2 == 2 ** (3 ** 2)
  spec.LeftRightAssoc(27, LeftBinaryOp, ['**'])
  spec.Left(25, LeftBinaryOp, ['*', '/', '%'])
  spec.Left(23, LeftBinaryOp, ['+', '-'])
  spec.Left(21, LeftBinaryOp, ['<<', '>>'])
  spec.Left(19, LeftBinaryOp, ['<', '>', '<=', '>='])
  spec.Left(17, LeftBinaryOp, ['!=', '=='])
  spec.Left(15, LeftBinaryOp, ['&'])
  spec.Left(13, LeftBinaryOp, ['^'])
  spec.Left(11, LeftBinaryOp, ['|'])
  spec.Left(9, LeftBinaryOp, ['&&'])
  spec.Left(7, LeftBinaryOp, ['||'])

  spec.LeftRightAssoc(5, LeftTernary, ['?'])

  # Right associative: a = b = 2 is a = (b = 2)
  spec.LeftRightAssoc(3, LeftAssign, [
      '=',
      '+=', '-=', '*=', '/=', '%=',
      '<<=', '>>=', '&=', '^=', '|='])

  spec.Left(COMMA_PREC, LeftComma, [','])

  # 0 precedence -- doesn't bind until )
  spec.Null(0, NullParen, ['('])  # for grouping

  # -1 precedence -- never used
  spec.Null(-1, NullConstant, ['name', 'number'])
  spec.Null(-1, tdop.NullError, [')', ']', ':', 'eof'])

  return spec
def MakeParser(s):
  """Build a Parser over the string s.  Used by tests."""
  return tdop.Parser(MakeShellParserSpec(), tdop.Tokenize(s))
def ParseShell(s, expected=None):
  """Parse s, optionally check its s-expression repr, and print it.

  Used by tests.
  """
  parser = MakeParser(s)
  tree = parser.Parse()
  sexpr = repr(tree)
  if expected is not None:
    assert sexpr == expected, '%r != %r' % (sexpr, expected)
  print('%-40s %s' % (s, sexpr))
  return tree
def main(argv):
  """Command-line entry point: parse one expression and print the tree."""
  if len(argv) < 2:
    print('Usage: ./arith_parse.py EXPRESSION')
    return
  s = argv[1]
  try:
    ParseShell(s)
  except tdop.ParseError as e:
    print('Error parsing %r: %s' % (s, e), file=sys.stderr)


if __name__ == '__main__':
  main(sys.argv)
View
@@ -0,0 +1,173 @@
#!/usr/bin/python3
import tdop
import arith_parse
def _assertParseError(make_parser, s, error_substring=''):
  """Assert that parsing s raises tdop.ParseError mentioning error_substring."""
  parser = make_parser(s)
  try:
    parser.Parse()
  except tdop.ParseError as e:
    err = str(e)
    if error_substring not in err:
      raise AssertionError('Expected %r to be in %r' % (error_substring, err))
    print('got expected error for %s: %s' % (s, err))
  else:
    raise AssertionError('%r should have failed' % s)
def TestArith(t_parse):
  """Basic arithmetic, assignment, precedence and unary-minus cases."""
  cases = [
      ('1+2+3', '(+ (+ 1 2) 3)'),
      ('1+2*3', '(+ 1 (* 2 3))'),
      ('4*(2+3)', '(* 4 (+ 2 3))'),
      ('(2+3)*4', '(* (+ 2 3) 4)'),
      ('1<2', '(< 1 2)'),
      ('x=3', '(= x 3)'),
      ('x = 2*3', '(= x (* 2 3))'),
      ('x = y', '(= x y)'),
      ('x*y - y*z', '(- (* x y) (* y z))'),
      ('x/y - y%z', '(- (/ x y) (% y z))'),
      ('x = y', '(= x y)'),
      ('2 ** 3 ** 2', '(** 2 (** 3 2))'),
      ('a = b = 10', '(= a (= b 10))'),
      ('x = ((y*4)-2)', '(= x (- (* y 4) 2))'),
      ('x - -y', '(- x (- y))'),
      ('-1 * -2', '(* (- 1) (- 2))'),
      ('-x * -y', '(* (- x) (- y))'),
      ('x - -234', '(- x (- 234))'),
      # Python doesn't allow this
      ('x += y += 3', '(+= x (+= y 3))'),
      # This is sort of nonsensical, but bash allows it.  The 1 is discarded
      # as the first element of the comma operator.
      ('x[1,2]', '(get x (, 1 2))'),
      # Python doesn't have unary +
      ('+1 - +2', '(- (+ 1) (+ 2))'),
      # LHS
      ('f[x] += 1', '(+= (get f x) 1)'),
  ]
  for expr, expected in cases:
    t_parse(expr, expected)
def TestBitwise(t_parse):
  """Bitwise operators and their precedence relative to each other."""
  cases = [
      ('~1 | ~2', '(| (~ 1) (~ 2))'),
      ('x & y | a & b', '(| (& x y) (& a b))'),
      ('~x ^ y', '(^ (~ x) y)'),
      ('x << y | y << z', '(| (<< x y) (<< y z))'),
      ('a ^= b-1', '(^= a (- b 1))'),
  ]
  for expr, expected in cases:
    t_parse(expr, expected)
def TestLogical(t_parse):
  """Logical operators, comparisons and the right-associative ternary."""
  cases = [
      ('a && b || c && d', '(|| (&& a b) (&& c d))'),
      ('!a && !b', '(&& (! a) (! b))'),
      ('a != b && c == d', '(&& (!= a b) (== c d))'),
      ('a > b ? 0 : 1', '(? (> a b) 0 1)'),
      ('a > b ? x+1 : y+1', '(? (> a b) (+ x 1) (+ y 1))'),
      ('1 ? true1 : 2 ? true2 : false', '(? 1 true1 (? 2 true2 false))'),
      ('1 ? true1 : (2 ? true2 : false)', '(? 1 true1 (? 2 true2 false))'),
      ('1 ? (2 ? true : false1) : false2', '(? 1 (? 2 true false1) false2)'),
      ('1 ? 2 ? true : false1 : false2', '(? 1 (? 2 true false1) false2)'),
      # Ternary should have higher precedence than comma
      ('x ? 1 : 2, y ? 3 : 4', '(, (? x 1 2) (? y 3 4))'),
  ]
  for expr, expected in cases:
    t_parse(expr, expected)
def TestUnary(t_parse):
  """Prefix/postfix increment, decrement, negation and complement."""
  cases = [
      ('!x', '(! x)'),
      ('x--', '(post-- x)'),
      ('x[1]--', '(post-- (get x 1))'),
      ('--x', '(-- x)'),
      ('++x[1]', '(++ (get x 1))'),
      ('!x--', '(! (post-- x))'),
      ('~x++', '(~ (post++ x))'),
      ('x++ - y++', '(- (post++ x) (post++ y))'),
      ('++x - ++y', '(- (++ x) (++ y))'),
  ]
  for expr, expected in cases:
    t_parse(expr, expected)

#
# 1. x++  f()  x[]  -- left associative
#    f(x)[1]++ means (++ (get (call f x) 1))
# 2. ++x  +  -  !  ~  -- right associative
#    -++x means (- (++ x))
def TestArrays(t_parse):
  """Shared between shell, oil, and Python."""
  for expr, expected in [('x[1]', '(get x 1)'),
                         ('x[a+b]', '(get x (+ a b))')]:
    t_parse(expr, expected)
def TestComma(t_parse):
  """The comma sequencing operator flattens into one node."""
  t_parse('x=1,y=2,z=3', '(, (= x 1) (= y 2) (= z 3))')
def TestFuncCalls(t_parse):
  """Function calls, argument lists, and calls mixed with other operators."""
  cases = [
      ('x = y(2)*3 + y(4)*5',
       '(= x (+ (* (call y 2) 3) (* (call y 4) 5)))'),
      ('x(1,2)+y(3,4)', '(+ (call x 1 2) (call y 3 4))'),
      ('x(a,b,c[d])', '(call x a b (get c d))'),
      ('x(1,2)*j+y(3,4)*k+z(5,6)*l',
       '(+ (+ (* (call x 1 2) j) (* (call y 3 4) k)) (* (call z 5 6) l))'),
      ('print(test(2,3))', '(call print (call test 2 3))'),
      ('print("x")', '(call print x)'),
      ('min(255,n*2)', '(call min 255 (* n 2))'),
      ('c = pal[i*8]', '(= c (get pal (* i 8)))'),
  ]
  for expr, expected in cases:
    t_parse(expr, expected)
def TestErrors(p):
  """Each case: (expression, substring required in the parse error)."""
  error_str = "can't be used in prefix"
  cases = [
      ('}', ''),
      (']', ''),
      ('{', ''),  # depends on language
      ("x+1 = y", "Can't assign"),
      ("(x+1)++", "Can't assign"),
      # Should be an EOF error
      ('foo ? 1 :', 'Unexpected end'),
      ('foo ? 1 ', 'expected :'),
      ('%', "can't be used in prefix position"),
      ('}', ''),
      ('{', ''),
      (']', error_str),
      ('1 ( 2', "can't be called"),
      ('(x+1) ( 2 )', "can't be called"),
      #('1 ) 2', ''),
      ('1 [ 2 ]', "can't be indexed"),
  ]
  for s, substring in cases:
    _assertParseError(p, s, substring)
def main():
  """Run every test group against the shell arithmetic parser."""
  t_parse = arith_parse.ParseShell
  make_parser = arith_parse.MakeParser
  for test in (TestArith, TestBitwise, TestLogical, TestUnary, TestArrays,
               TestFuncCalls, TestComma):
    test(t_parse)
  TestErrors(make_parser)


if __name__ == '__main__':
  main()
View
@@ -24,7 +24,49 @@
# NOTE: a stray leftover line from a previous edit duplicated the closing of
# this list and broke the file; this is the single intended definition.
__all__ = [
    'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor',
    'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check',
    'is_simple']
# PATCH: Moved this function from asdl_c.py.
def is_simple(sum):
  """Return True if a sum is simple.

  A sum is simple if none of its constructors have fields, e.g.
    unaryop = Invert | Not | UAdd | USub
  """
  return not any(t.fields for t in sum.types)
class StrType:
  """Descriptor for the builtin 'string' type."""
  pass

class IntType:
  """Descriptor for the builtin 'int' type."""
  pass

# Maps an ASDL builtin type name to its (singleton) descriptor instance.
DESCRIPTORS_BY_NAME = {
    'string': StrType(),
    'int': IntType(),
}
class ArrayType:
  """Descriptor for a repeated field (f.seq), wrapping the item descriptor."""
  def __init__(self, desc):
    self.desc = desc

class MaybeType:
  """Descriptor for an optional field (f.opt), wrapping the item descriptor."""
  def __init__(self, desc):
    self.desc = desc  # another descriptor
# The following classes define nodes into which the ASDL description is parsed.
# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST
View
@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
asdl_tool.py
"""
import sys
import asdl
import arith_parse
import py_meta
import encode
def main(argv):
  """Dispatch on the first CLI argument: 'py' or 'arith-encode'."""
  try:
    action = argv[1]
  except IndexError:
    raise RuntimeError('Action required')

  if action == 'py':
    # Generate Python classes for the schema, into this module's namespace.
    schema_path = argv[2]
    module = asdl.parse(schema_path)
    root = sys.modules[__name__]
    py_meta.MakeTypes(module, root)
    print(dir(root))
  elif action == 'arith-encode':
    # Parse an arithmetic expression and serialize it to a binary image.
    expr = argv[2]
    out_path = argv[3]
    obj = arith_parse.ParseShell(expr)
    enc = encode.Params()
    with open(out_path, 'wb') as f:
      encode.EncodeRoot(obj, enc, encode.BinOutput(f))
  else:
    raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
  try:
    main(sys.argv)
  except RuntimeError as e:
    print('FATAL: %s' % e, file=sys.stderr)
    sys.exit(1)
View
@@ -0,0 +1,268 @@
#!/usr/bin/python
"""
encode.py
"""
import sys
import asdl
import py_meta
_DEFAULT_ALIGNMENT = 4


class BinOutput:
  """Writes alignment-padded chunks to a file.

  Keeps track of block indexes ("refs") naming where each chunk begins.
  """

  def __init__(self, f, alignment=_DEFAULT_ALIGNMENT):
    self.f = f
    # Index of the next block to be written; returned as a ref by Write().
    self.last_block = 0
    self.alignment = alignment

  def WriteRootRef(self, chunk):
    """Patch the 3-byte root ref into the header, just past 'OHP\\x01\\x04'."""
    self.f.seek(5)
    assert len(chunk) == 3
    self.f.write(chunk)

  def Write(self, chunk):
    """Append chunk (already padded to the alignment); return its block ref."""
    # Input should be padded by the encoder (Params._Pad).
    assert len(chunk) % self.alignment == 0
    self.f.write(chunk)
    ref = self.last_block
    self.last_block += len(chunk) // self.alignment  # int division
    # Return a reference to the beginning of the chunk.
    return ref


class Params:
  """Encoding parameters.

  Hm most of these settings should be per-field, expressed in the schema.  The
  only global one is the ref/pointer alignment.  4 and 8 are the most likely
  choices, and 4 is probably fine, because you have 64 MB of addressable memory
  with 24 bit pointers.
  """

  def __init__(self, alignment=_DEFAULT_ALIGNMENT):
    self.alignment = alignment
    self.pointer_type = 'uint32_t'

    self.tag_width = 1  # for ArithVar vs ArithWord.
    self.ref_width = 3  # 24 bits
    self.int_width = 3  # 24 bits
    # used for fd, line/col; also I guess stuff like SimpleCommand
    self.index_width = 2  # 16 bits, e.g. max 64K entries in an array

    # Exclusive upper bounds: a value must be strictly less than these to fit.
    # BUG FIX: max_int was derived from ref_width; use int_width (the widths
    # happen to be equal today, so the value is unchanged).
    self.max_int = 1 << (self.int_width * 8)
    self.max_index = 1 << (self.index_width * 8)
    self.max_tag = 1 << (self.tag_width * 8)

  def Tag(self, i, chunk):
    """Append a 1-byte constructor tag."""
    # BUG FIX: was 'i > self.max_tag', which let i == max_tag wrap to 0.
    if i >= self.max_tag:
      raise AssertionError('Invalid id %r' % i)
    chunk.append(i & 0xFF)

  def Int(self, n, chunk):
    """Append n as a little-endian integer of int_width bytes."""
    # BUG FIX: this used to raise the undefined name 'Error' (a NameError),
    # and the bound was inclusive, silently truncating n == max_int.
    if n >= self.max_int:
      raise ValueError('%d is too big to fit in %d bytes' % (n, self.int_width))
    for i in range(self.int_width):
      chunk.append(n & 0xFF)
      n >>= 8

  def Ref(self, n, chunk):
    """Refs are encoded exactly like ints (ref_width == int_width)."""
    self.Int(n, chunk)

  def _Pad(self, chunk):
    """Pad chunk with NUL bytes out to the alignment; returns chunk."""
    n = len(chunk)
    a = self.alignment
    if n % a != 0:
      chunk.extend(b'\x00' * (a - (n % a)))
    return chunk

  # Right now all strings are references.  Later they could be inline.
  def Str(self, s, chunk):
    """Append s as NUL-terminated UTF-8 (no length prefix)."""
    # NOTE: For variable, proc, and function names, it could make sense to
    # pre-compute and store a hash value.  They will be looked up in the stack
    # and so forth.
    # - You could also return a obj number or object ID.
    chunk.extend(s.encode('utf-8'))
    chunk.append(0)  # NUL terminator

  def PaddedStr(self, s):
    """Encode s as a standalone, alignment-padded chunk."""
    # NOTE:
    # - The encoder could also have an intern table to save space.
    # - Str and PaddedStr will both return char* ?  Should we allow them to
    #   VARY with the same schema?  It's basically small size optimization
    #   and "flexible array" optimization.  I think you want that possibility.
    chunk = bytearray()
    self.Str(s, chunk)
    return self._Pad(chunk)

  def Bytes(self, buf, chunk):
    """Append buf with a little-endian length prefix of index_width bytes."""
    n = len(buf)
    if n >= self.max_index:
      raise RuntimeError("bytes object is too long (%d)" % n)
    for i in range(self.index_width):
      chunk.append(n & 0xFF)
      n >>= 8
    # NOTE(review): despite the name, buf is expected to be a str here (it is
    # UTF-8 encoded), and the length prefix counts characters, not encoded
    # bytes -- confirm against callers before relying on the prefix.
    chunk.extend(buf.encode('utf-8'))

  def PaddedBytes(self, buf):
    """Encode buf as a standalone, alignment-padded chunk."""
    chunk = bytearray()
    self.Bytes(buf, chunk)
    return self._Pad(chunk)

  def PaddedBlock(self, chunk):
    """Pad an assembled record/array chunk out to the alignment."""
    return self._Pad(chunk)
def EncodeArray(obj_list, item_desc, enc, out):
  """Encode a homogeneous array and return a ref to it.

  Args:
    obj_list: List of Obj values
    item_desc: descriptor for the element type
    enc: encoding Params
    out: BinOutput
  Returns:
    ref (block index) of the encoded array
  """
  array_chunk = bytearray()
  enc.Int(len(obj_list), array_chunk)  # Length prefix

  if isinstance(item_desc, asdl.IntType):
    # Integers are stored inline.
    for item in obj_list:
      enc.Int(item, array_chunk)
  elif isinstance(item_desc, asdl.Sum) and asdl.is_simple(item_desc):
    # Simple sums (enums) are stored inline as their integer ids.
    for item in obj_list:
      enc.Int(item.enum_id, array_chunk)
  else:
    # A simple value is either an int, enum, or pointer.  (Later: Iter<Str>
    # might be possible for locality.)
    assert isinstance(item_desc, asdl.Sum) or isinstance(
        item_desc, asdl.Product), item_desc
    # This is like vector<T*>
    # Later:
    # - Product types can be put in line
    # - Sum types can even be put in line, if you have List<T> rather than
    #   Array<T>.  Array implies O(1) random access; List doesn't.
    for item in obj_list:
      # Recursive call: children are written before this array.
      ref = EncodeObj(item, enc, out)
      enc.Ref(ref, array_chunk)

  this_ref = out.Write(enc.PaddedBlock(array_chunk))
  return this_ref
def EncodeObj(obj, enc, out):
  """Encode one compound object and, recursively, everything it refers to.

  Args:
    obj: Obj to encode
    enc: encoding params
    out: output file
  Returns:
    ref: Reference to the block holding this object's record
  """
  # Algorithm: Depth first, post-order traversal.  First obj is the first leaf.
  # last obj is the root.
  #
  # Array is a homogeneous type.
  this_chunk = bytearray()
  assert isinstance(obj, py_meta.CompoundObj), \
      '%s is not a compound obj (%r)' % (obj, obj.__class__)
  # Only constructors of a sum type carry a tag; products are unambiguous.
  if isinstance(obj.DESCRIPTOR, asdl.Constructor):
    enc.Tag(obj.tag, this_chunk)

  for name in obj.FIELDS:  # encode in order
    desc = obj.DESCRIPTOR_LOOKUP[name]
    field_val = getattr(obj, name)

    # TODO:
    # - Float would be inline, etc.
    # - Optional value: write all enc.Ref(0)?  This is 'nullptr'.
    # - Repeated value: write them all adjacent to each other?

    # INLINE
    if isinstance(desc, asdl.IntType):
      enc.Int(field_val, this_chunk)

    elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
      # Encode enums as integers.  TODO later: Don't use 3 bytes!  Can use 1
      # byte for most enums.
      enc.Int(field_val.enum_id, this_chunk)

    # Write variable length field first, assuming that it's a ref/pointer.
    # TODO: allow one inline, hanging string or array per record.
    elif isinstance(desc, asdl.StrType):
      ref = out.Write(enc.PaddedStr(field_val))
      enc.Ref(ref, this_chunk)

    elif isinstance(desc, asdl.ArrayType):
      item_desc = desc.desc
      ref = EncodeArray(field_val, item_desc, enc, out)
      enc.Ref(ref, this_chunk)

    elif isinstance(desc, asdl.MaybeType):
      item_desc = desc.desc
      # Only compound optionals are supported: a missing value is encoded
      # as ref 0, which couldn't be distinguished for an inline int/enum.
      ok = False
      if isinstance(item_desc, asdl.Sum):
        if not asdl.is_simple(item_desc):
          ok = True
      elif isinstance(item_desc, asdl.Product):
        ok = True
      if not ok:
        # NOTE(review): the ',' below should probably be '%' -- as written
        # the message is never interpolated with field_val.
        raise AssertionError(
            "Currently not encoding simple optional types: %s", field_val)
      if field_val is None:
        enc.Ref(0, this_chunk)
      else:
        ref = EncodeObj(field_val, enc, out)
        enc.Ref(ref, this_chunk)

    else:
      # Recursive call for child records.  Write children before parents.
      ref = EncodeObj(field_val, enc, out)
      enc.Ref(ref, this_chunk)

  # Write the parent record
  this_ref = out.Write(enc.PaddedBlock(this_chunk))
  return this_ref
def EncodeRoot(obj, enc, out):
  """Encode the whole tree: header, placeholder, objects, then patch root ref."""
  ref = out.Write(b'OHP\x01')  # 4-byte header: magic 'OHP' + version 1
  assert ref == 0
  # One alignment byte (4), then a 3-byte placeholder for the root ref;
  # WriteRootRef patches it once the root's location is known.
  ref = out.Write(b'\4\0\0\0')
  assert ref == 1
  root_ref = EncodeObj(obj, enc, out)
  chunk = bytearray()
  enc.Ref(root_ref, chunk)
  out.WriteRootRef(chunk)
  #print("Root obj ref:", root_ref)
View
@@ -0,0 +1,32 @@
#!/usr/bin/python -S
"""
encode_test.py: Tests for encode.py
"""
import unittest
import encode # module under test
class EncoderTest(unittest.TestCase):

  def testEncoder(self):
    # Use a large (16-byte) alignment so the padding is easy to see.
    p = encode.Params(16)
    chunk = bytearray()
    p.Int(1, chunk)
    # Ints are little-endian, int_width (3) bytes.
    self.assertEqual(b'\x01\x00\x00', chunk)
    chunk = p.PaddedBytes('0123456789')
    # 2 byte length prefix -- max 64K entries -- then the payload, padded
    # out to the 16-byte alignment.
    self.assertEqual(b'\x0A\x000123456789\x00\x00\x00\x00', bytes(chunk))
    chunk = p.PaddedStr('0123456789')
    # Strings have no length prefix: payload, NUL terminator, then padding.
    self.assertEqual(b'0123456789\x00\x00\x00\x00\x00\x00', bytes(chunk))
    #p.Block([b'a', b'bc'])

if __name__ == '__main__':
  unittest.main()
View

(Note: one file's diff was too large to render and was omitted here.)
View
@@ -0,0 +1,109 @@
-- ASDL's six builtin types are identifier, int, string, bytes, object, singleton
-- NOTE we're not using identifier/object/singleton/bytes
-- Python only uses bytes/object/singleton once.
-- identifier is used all over. Why? I have different rules for
-- for functions and vars. case/for/assign have vars.
-- TODO:
-- How to encode position information?
-- FuncCall inside ${}, in addition to arithmetic context?
-- Well I guess you can do this: echo $(( f(x,y) ))a. It is a little more
-- annoying.
module osh
{
source_location = (string path, int line, int col, int length)
token = (string value, source_location loc)
id = Foo | Bar -- TODO: you need a placeholder to say this is generated
-- by another tool. Suppress the error.
-- TODO: generate the following:
-- arith_op_id, bool_op_id, vpre_op_id, array_op_id, vpost_op_id
bracket_op =
ArrayOp(id op_id) -- * or @
| ArrayIndex(arith_expr a)
suffix_op =
VarUnary(id op_id, word arg)
| VarReplace(word pat, word? replace)
| VarSlice(arith_expr start, arith_expr? len)
word_part =
ArrayLiteralPart(word* words)
| LiteralPart(token t)
| EscapedLiteralPart(token t)
| DoubleQuotedPart(word_part* parts)
| VarSubPart(string name,
id prefix_op,
bracket_op bracket_op
suffix_op suffix_op)
| TildeSubPart(string prefix)
| CommandSubPart(command c)
| ArithSubPart(arith_expr a)
-- NOTE: Could put | Token(token t) as an optimization.
word = Word(word_part* parts)
-- NOTE: Could put | Token(token t) as an optimization.
arith_expr =
ArithVar(string name) -- eval variable
| ArithWord(word w) -- word to be evaluated as a constant
| ArithUnary(id op_id, arith_expr a)
| ArithBinary(id op_id, arith_expr left, arith_expr right)
| TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)
| FuncCall(arith_expr func, arith_expr* args)
bool_expr =
BoolBinary(word left, word right)
| BoolUnary(word child)
| LogicalNot(bool_expr b)
| LogicalAnd(bool_expr left, bool_expr right)
| LogicalOr(bool_expr left, bool_expr right)
-- NOTE: To reprint the here doc, I guess we need the whole delimiter? And
-- then do_expansion is calculated from that.
redir =
HereDoc(id op_id, word arg_word, int fd, int do_expansion)
| Redirect(id op_id, word arg_word, int fd)
lvalue =
LeftVar(string name)
| LeftIndex(string name, arith_expr index)
scope = Global | Local
var_flags = Export | Readonly
binding = (lvalue lhs, word rhs)
and_or = DAmp | DPipe
-- |& in osh; |- in oil.
pipe_op = Pipe | PipeAndStderr
case_arm = (word* pat, command* action)
command =
NoOp
| SimpleCommand(word* words, redir* redirects, binding* more_env)
| Assignment(scope scope,
var_flags flags,
word* names, -- names mentioned without a binding
binding* bindings)
| DParen(arith_expr a)
| DBracket(bool_expr b)
| Block(command* commands)
| Subshell(command* commands)
| Fork(command* commands) -- shell only allows one command
| Pipeline(command* commands, int negated, pipe_op* op)
| AndOr(command* commands, and_or ops)
-- NOTE: Can't have multiple var length? Maybe it's just a single command.
| ForEach(string var, word* words, command* body)
| ForExpr(arith_expr init, arith_expr test, arith_expr update, command* body)
-- NOTE: in oil, we will have expression variants?i
| While(command cond, command* body)
| Until(command cond, command* body)
| If(command test, command* body, command* orelse)
| Case(string var_name, case_arm* cases)
| FuncDef(string name, command* body)
}
View
@@ -0,0 +1,52 @@
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include "osh.asdl.h"
// Returns the root ref, or -1 for invalid.
//
// NOTE(review): this disagrees with the Python encoder (encode.py), which
// writes 'OHP' + version byte 0x01 at offsets 0-3, the alignment byte 0x04
// at offset 4, and a 3-byte little-endian root ref at offsets 5-7.  Here
// image[3] is checked against 4 (the alignment, not the version) and a
// 4-byte value is read starting at offset 4, folding the alignment byte
// into the low byte of the result.  Confirm which side is authoritative.
int GetRootRef(uint8_t* image) {
  if (image[0] != 'O') return -1;
  if (image[1] != 'H') return -1;
  if (image[2] != 'P') return -1;
  if (image[3] != 4) return -1;
  return image[4] + (image[5] << 8) + (image[6] << 16) + (image[7] << 24);
}
int main(int argc, char **argv) {
  // BUG FIX: was 'argc == 0', which never triggers in practice and let
  // argv[1] be read when no filename was given.
  if (argc < 2) {
    printf("Expected filename\n");
    return 1;
  }
  FILE *f = fopen(argv[1], "rb");
  if (!f) {
    printf("Error opening %s", argv[1]);
    return 1;
  }

  // Determine the file size, then read the whole image into memory.
  fseek(f, 0, SEEK_END);
  size_t num_bytes = ftell(f);
  fseek(f, 0, SEEK_SET);  // same as rewind(f);

  uint8_t* image = static_cast<uint8_t*>(malloc(num_bytes + 1));
  // BUG FIX: the fread result was unchecked; a short read left the buffer
  // partially uninitialized.
  if (num_bytes > 0 && fread(image, num_bytes, 1, f) != 1) {
    printf("Error reading %s\n", argv[1]);
    fclose(f);
    return 1;
  }
  fclose(f);

  image[num_bytes] = 0;  // NUL-terminate so string ops can't run off the end
  printf("Read %zu bytes\n", num_bytes);

  int root_ref = GetRootRef(image);
  if (root_ref == -1) {
    printf("Invalid image\n");
    return 1;
  }

  // Hm we could make the root ref be a BYTE offset?
  int alignment = 4;
  printf("alignment: %d root: %d\n", alignment, root_ref);

  auto base = reinterpret_cast<uint32_t*>(image);
  size_t offset = alignment * root_ref;
  auto expr = reinterpret_cast<arith_expr_t*>(image + offset);
  //PrintExpr(base, *expr, 0);
  (void)base;  // silence unused warnings until PrintExpr is enabled
  (void)expr;
}
View
@@ -0,0 +1,249 @@
#!/usr/bin/env python3
"""
py_meta.py
Parse an ASDL file, and generate Python classes using metaprogramming.
All objects descends from Obj, which allows them to be dynamically type-checked
and serialized. Objects hold type descriptors, which are defined in asdl.py.
Usage:
from osh import ast
n1 = ast.ArithVar()
n2 = ast.ArrayLiteralPart()
API Notes:
The Python AST module doesn't make any distinction between simple and compound
sum types. (Simple types have no constructors with fields.)
C++ has to make this distinction for reasons of representation. It's more
efficient to hold an enum value than a pointer to a class with an enum value.
In Python I guess that's not quite true.
So in order to serialize the correct bytes for C++, our Python metaclass
implementation has to differ from what's generated by asdl_c.py. More simply
put: an op is Add() and not Add, an instance of a class, not an integer value.
"""
import sys
import pprint
import asdl
def _CheckType(value, expected_desc):
  """Is value of type expected_desc?

  Args:
    value: Obj or primitive type
    expected_desc: instance of asdl.Product, asdl.Sum, asdl.StrType,
      asdl.IntType, ArrayType, MaybeType, etc.
  """
  if isinstance(expected_desc, asdl.MaybeType):
    # None is always acceptable for an optional field.
    return value is None or _CheckType(value, expected_desc.desc)

  if isinstance(expected_desc, asdl.ArrayType):
    return (isinstance(value, list) and
            all(_CheckType(elem, expected_desc.desc) for elem in value))

  if isinstance(expected_desc, asdl.StrType):
    return isinstance(value, str)

  if isinstance(expected_desc, asdl.IntType):
    return isinstance(value, int)

  # Compound values carry their own descriptor; anything without one fails.
  try:
    actual_desc = value.__class__.DESCRIPTOR
  except AttributeError:
    return False  # it's not of the right type

  if isinstance(expected_desc, asdl.Product):
    return actual_desc is expected_desc

  if isinstance(expected_desc, asdl.Sum):
    if asdl.is_simple(expected_desc):
      return actual_desc is expected_desc
    # A compound sum: the value must be one of its constructor alternatives.
    return any(actual_desc is cons for cons in expected_desc.types)
class Obj:
  """Base for all generated AST node classes (simple and compound)."""
  # NOTE: We're using CAPS for these static fields, since they are constant at
  # runtime after metaprogramming.
  DESCRIPTOR = None  # Used for type checking
class SimpleObj(Obj):
  """An enum value.

  Other simple objects: int, str, maybe later a float.
  """

  def __init__(self, enum_id, name):
    self.enum_id = enum_id
    self.name = name

  def __repr__(self):
    return f'<{type(self).__name__} {self.name} {self.enum_id}>'
class CompoundObj(Obj):
  """A compound object with fields, e.g. a Product or Constructor.

  Uses some metaprogramming: FIELDS and DESCRIPTOR_LOOKUP are filled in
  per-class by MakeTypes() when types are generated from a schema.
  """
  FIELDS = []  # ordered list of field names
  DESCRIPTOR_LOOKUP = {}  # field name: (asdl.Type | int | str)

  def __init__(self, *args, **kwargs):
    # The user must specify ALL required fields or NONE.
    self._assigned = {f: False for f in self.FIELDS}
    if args or kwargs:
      self._Init(args, kwargs)
    else:
      # Set defaults here?
      pass

  def _Init(self, args, kwargs):
    """Assign positional then keyword args, then check required fields."""
    for i, val in enumerate(args):
      name = self.FIELDS[i]
      self._assigned[name] = True
      self.__setattr__(name, val)

    for name, val in kwargs.items():
      if self._assigned[name]:
        raise AssertionError('Duplicate assignment of field %r' % name)
      self._assigned[name] = True
      self.__setattr__(name, val)

    for name in self.FIELDS:
      if not self._assigned[name]:
        desc = self.DESCRIPTOR_LOOKUP[name]
        if isinstance(desc, asdl.MaybeType):
          self.__setattr__(name, None)  # Maybe values can be None
        else:
          # If anything was set, then required fields raise an error.
          raise ValueError("Field %r is required and wasn't initialized" % name)

  def CheckUnassigned(self):
    """See if there are unassigned (required) fields, for later encoding."""
    unassigned = []
    for name in self.FIELDS:
      if not self._assigned[name]:
        desc = self.DESCRIPTOR_LOOKUP[name]
        if not isinstance(desc, asdl.MaybeType):
          unassigned.append(name)
    if unassigned:
      # BUG FIX: the message used to read "were't be assigned".
      raise ValueError("Fields %r weren't assigned" % unassigned)

  def __setattr__(self, name, value):
    # _assigned itself must bypass the descriptor check to avoid recursion.
    if name == '_assigned':
      self.__dict__[name] = value
      return
    desc = self.DESCRIPTOR_LOOKUP[name]
    if not _CheckType(value, desc):
      raise AssertionError("Field %r should be of type %s, got %r" %
                           (name, desc, value))
    self._assigned[name] = True  # check this later when encoding
    self.__dict__[name] = value

  def __repr__(self):
    return '<%s %s>' % (self.__class__.__name__, pprint.pformat(self.__dict__))
def _MakeFieldDescriptors(module, fields):
  """Build the FIELDS / DESCRIPTOR_LOOKUP class attributes for a node type."""
  desc_lookup = {}
  for field in fields:
    # Primitive types (int, string) are looked up by name; anything else
    # must be a Sum or Product defined in the schema module.
    desc = asdl.DESCRIPTORS_BY_NAME.get(field.type)
    if desc is None:
      desc = module.types[field.type]
      assert (isinstance(desc, asdl.Sum) or
              isinstance(desc, asdl.Product)), desc

    # Wrap the descriptor for optional/repeated fields, so type checking
    # and encoding can see the modifier.  A field can't be both.
    assert not (field.opt and field.seq), field
    if field.opt:
      desc = asdl.MaybeType(desc)
    if field.seq:
      desc = asdl.ArrayType(desc)
    desc_lookup[field.name] = desc

  return {
      'FIELDS': [field.name for field in fields],
      'DESCRIPTOR_LOOKUP': desc_lookup,
  }
def MakeTypes(module, root):
  """Generate a Python class for every type in an ASDL module.

  Args:
    module: asdl.Module
    root: an object/package to add types to
  """
  for defn in module.dfns:
    typ = defn.value
    #print('TYPE', defn.name, typ)
    if isinstance(typ, asdl.Sum):
      if asdl.is_simple(typ):
        # A sum with no constructor fields: model it as an enum.  The class
        # gets one SimpleObj instance per alternative as a static attribute.
        class_attr = {'DESCRIPTOR': typ}  # asdl.Sum
        cls = type(defn.name, (SimpleObj, ), class_attr)
        #print('CLASS', cls)
        setattr(root, defn.name, cls)
        for i, cons in enumerate(typ.types):
          enum_id = i + 1
          name = cons.name
          val = cls(enum_id, cons.name)  # Instantiate SimpleObj subtype
          # Set a static attribute like op_id.Plus, op_id.Minus.
          setattr(cls, name, val)
      else:
        # e.g. for arith_expr: an empty base class, plus one CompoundObj
        # subclass per constructor.
        base_class = type(defn.name, (CompoundObj, ), {})
        setattr(root, defn.name, base_class)
        # Make a type for each alternative.
        for i, cons in enumerate(typ.types):
          class_attr = _MakeFieldDescriptors(module, cons.fields)
          class_attr['DESCRIPTOR'] = cons
          # TODO: Allow setting these integers.  We're reusing ID 0 for every
          # sum type, but that's OK because fields are strongly typed.
          class_attr['tag'] = i + 1  # zero reserved?
          cls = type(cons.name, (base_class, ), class_attr)
          setattr(root, cons.name, cls)
    elif isinstance(typ, asdl.Product):
      class_attr = _MakeFieldDescriptors(module, typ.fields)
      class_attr['DESCRIPTOR'] = typ
      cls = type(defn.name, (CompoundObj, ), class_attr)
      setattr(root, defn.name, cls)
    else:
      raise AssertionError(typ)
View
@@ -0,0 +1,16 @@
#!/usr/bin/env python3
"""
py_meta_test.py: Tests for py_meta.py
"""
import re
import unittest
import py_meta # module under test
class AsdlTest(unittest.TestCase):
  # Placeholder: no py_meta tests written yet.
  pass

if __name__ == '__main__':
  unittest.main()
View
@@ -0,0 +1,219 @@
#!/bin/bash
#
# Automation for ASDL.
#
# Usage:
# ./run.sh <function name>
set -o nounset
set -o pipefail
set -o errexit
# Run unit tests.
# Run unit tests.
unit() {
  asdl/arith_ast_test.py
  asdl/py_meta_test.py
  asdl/encode_test.py
  # NOTE(review): this early return makes the glob loop below dead code --
  # presumably a temporary short-circuit to run an explicit subset; confirm
  # whether the loop should be re-enabled or removed.
  return
  for t in asdl/*_test.py; do
    echo -----
    echo $t
    echo -----
    $t
  done
}
asdl-arith-encode() {
local expr="$1"
local out=${2:-_tmp/arith.bin}
asdl/asdl_demo.py arith-encode "$expr" $out
ls -l $out
hexdump $out
}
asdl-py() {
local schema=$1
asdl/asdl_demo.py py $schema
}
asdl-cpp() {
local schema=${1:-asdl/arith.asdl}
local src=${2:-_tmp/arith.asdl.h}
asdl/gen_cpp.py cpp $schema > $src
ls -l $src
wc -l $src
}
py-cpp() {
local schema=${1:-asdl/arith.asdl}
asdl-py $schema
asdl-cpp $schema _tmp/$(basename $schema).h
}
#
# Test specific schemas
#
arith-both() {
py-cpp asdl/arith.asdl
}
osh-both() {
py-cpp asdl/osh.asdl
}
#
# Native Code
#
readonly CLANG=~/install/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/bin/clang++
cxx() {
  # Compile C++11 with the pinned clang.  -Winline reports when the
  # compiler declines to inline a function declared inline.
  #local CXX=c++
  local CXX=$CLANG
  # Fix: the original assigned '-O2' and then immediately overwrote it with
  # '-O0', so only -O0 ever took effect; the dead assignment is removed.
  local opt_flag='-O0'
  # -Winline
  # http://stackoverflow.com/questions/10631283/how-will-i-know-whether-inline-function-is-actually-replaced-at-the-place-where
  $CXX -Winline $opt_flag -std=c++11 "$@"
}
# http://www.commandlinefu.com/commands/view/6004/print-stack-trace-of-a-core-file-without-needing-to-enter-gdb-interactively
# http://stackoverflow.com/questions/4521015/how-to-pass-arguments-and-redirect-stdin-from-a-file-to-program-run-in-gdb
gdb-trace() {
  # Run a program non-interactively under gdb and print a backtrace,
  # merging gdb's output into stdout.
  # -args goes before the executable
  gdb -batch -ex "run" -ex "bt" -args "$@" 2>&1
}
build-demo() {
  # Build the C++ demo binary for a schema.
  # $1: schema base name, e.g. 'arith'
  # $2: optional output binary path (default _tmp/<name>_demo).
  #     Fix: callers such as arith-demo already pass this argument, but it
  #     was silently ignored; now honored, with the old path as default.
  local name=$1
  local schema=asdl/${name}.asdl

  # Generate C++ code
  asdl-cpp "$schema" "_tmp/${name}.asdl.h"

  local bin=${2:-_tmp/${name}_demo}
  cxx -I _tmp -o "$bin" "asdl/${name}_demo.cc"
  chmod +x "$bin"
}
arith-demo() {
  # End-to-end demo: encode an expression, build the C++ reader, and run
  # it under gdb to capture a backtrace.
  local name=arith
  local data=_tmp/${name}.bin

  # Write a binary
  asdl-arith-encode '7 + 9' $data

  # $bin matches the path build-demo uses for its output.
  local bin=_tmp/${name}_demo
  build-demo $name $bin

  set -x  # trace subsequent commands for the rest of the run
  gdb-trace $bin $data
  #$bin $data
}
osh-demo() {
  # Build the demo binary for the osh schema (no run step yet).
  build-demo osh
}
a2() {
  # Demo: encode a larger expression and run the C++ reader directly.
  local data=_tmp/a2.bin
  asdl-arith-encode 'foo + 99 - f(1,2,3+4) * 123' $data
  _tmp/arith_demo $data
}

a3() {
  # Demo: function-call expression, run under gdb for a backtrace.
  local data=_tmp/a3.bin
  asdl-arith-encode 'g(x,2)' $data
  gdb-trace _tmp/arith_demo $data
}

a4() {
  # Demo: index and slice expressions, each run under gdb.
  local data=_tmp/a4.bin
  asdl-arith-encode 'array[99]' $data
  gdb-trace _tmp/arith_demo $data

  asdl-arith-encode 'array[5:10] * 5' $data
  gdb-trace _tmp/arith_demo $data
}
# http://stackoverflow.com/questions/22769246/disassemble-one-function-using-objdump
# It would be nice to disassemble a single function.
disassemble() {
  # Emit Intel-syntax assembly for the demo at a given optimization level.
  # $1: optimization flag (default -O0); writes _tmp/arith_demo<flag>.S
  local opt_flag=${1:-'-O0'}
  local out=_tmp/arith_demo$opt_flag.S
  $CLANG -std='c++11' $opt_flag -I _tmp -o $out -S \
    -mllvm --x86-asm-syntax=intel asdl/arith_demo.cc
  #cat $out
}

llvm() {
  # Emit LLVM IR for the demo at a given optimization level.
  # $1: optimization flag (default -O0); writes _tmp/arith_demo<flag>.ll
  local opt_flag=${1:-'-O0'}
  local out=_tmp/arith_demo$opt_flag.ll
  $CLANG -std='c++11' $opt_flag -I _tmp -o $out -S \
    -emit-llvm asdl/arith_demo.cc
  #cat $out
}
# With -O0, you can see all the functions.  With -O2, they ARE inlined.
objdump-arith() {
  # List function start addresses in the compiled demo.
  # NOTE: This doesn't take into account different optimization levels
  objdump -d _tmp/arith_demo | grep '^0'
}

# https://sourceware.org/ml/binutils/2010-04/msg00447.html
# http://stackoverflow.com/questions/4274804/query-on-ffunction-section-fdata-sections-options-of-gcc
# Hm you can force a function.  Write it inline with arith_demo.cc then.
# TODO: Is there a pattern we can grep for to test if ANY accessor was NOT
# inlined?  Demangle names I guess.
nm-arith() {
  # Dump the symbol table of the compiled demo.
  nm _tmp/arith_demo
}
opt-stats() {
  # Compare sizes and checksums of the generated .S / .ll files across
  # optimization levels (written by disassemble/llvm).
  wc -l _tmp/*.S
  echo
  wc -l _tmp/*.ll
  echo
  md5sum _tmp/*.S
  echo
  md5sum _tmp/*.ll
}

compare-opts() {
  # Generate assembly and LLVM IR at several optimization levels, then
  # summarize with opt-stats.
  # http://stackoverflow.com/questions/15548023/clang-optimization-levels
  # Says -Os is identical to -O2?  But not according to my test!
  for opt in -Os -O0 -O1 -O2 -O3 -O4; do
    echo $opt
    disassemble $opt
    llvm $opt
  done

  opt-stats
}
count() {
  # Line counts for the project's source groups.
  wc -l asdl/{asdl,py_meta,gen_cpp,encode}.py
  echo
  wc -l asdl/{py_meta,encode}_test.py
  echo
  wc -l asdl/arith_parse*.py asdl/tdop.py asdl/arith_ast.py asdl/asdl_demo.py
  echo
  wc -l asdl/*.cc
  echo
  wc -l asdl/*.asdl
  echo
}

# Dispatch: run the function named by the first command-line argument.
"$@"
View
@@ -0,0 +1,230 @@
#!/usr/bin/python3
"""
tdop.py
"""
import re
class ParseError(Exception):
  """Raised when the input cannot be parsed."""
#
# Default parsing functions give errors
#
def NullError(p, token, bp):
  """Default nud: reject a token appearing in prefix position."""
  raise ParseError("%s can't be used in prefix position" % token)
def LeftError(p, token, left, rbp):
  """Default led: reject a token appearing in infix position."""
  # Hm is this not called because of binding power?
  raise ParseError("%s can't be used in infix position" % token)
#
# Input
#
class Token:
  """A lexer token.

  Attributes:
    type: 'number', 'name', or the operator/bracket text itself.
    val: token value -- an int for numbers, otherwise the matched text.
    loc: optional (line, col) source location.
  """

  def __init__(self, type, val, loc=None):
    self.type = type
    self.val = val
    # Fix: loc was accepted but silently discarded; keep it so callers
    # (Tokenize passes loc=(0, 0)) can use it for error reporting.
    self.loc = loc

  def __repr__(self):
    return '<Token %s %s>' % (self.type, self.val)
#
# Using the pattern here: http://effbot.org/zone/xml-scanner.htm
#
# NOTE: () and [] need to be on their own so (-1+2) works
# Groups: 1=number, 2=name, 3=operator run, 4=single bracket.
# Fix: use a raw string -- '\s', '\d', '\w' in a non-raw literal are
# invalid escape sequences and warn on modern Python.
TOKEN_RE = re.compile(r"""
\s* (?: (\d+) | (\w+) | ( [\-\+\*/%!~<>=&^|?:,]+ ) | ([\(\)\[\]]) )
""", re.VERBOSE)
def Tokenize(s):
  """Yield a Token for each lexeme matched by TOKEN_RE in s.

  Exactly one of the four regex groups is non-empty per match: number,
  name, operator run, or bracket.
  """
  for number, name, op, bracket in TOKEN_RE.findall(s):
    if number:
      yield Token('number', int(number), loc=(0, 0))
    elif name:
      yield Token('name', name, loc=(0, 0))
    elif op:
      yield Token(op, op, loc=(0, 0))
    else:
      yield Token(bracket, bracket, loc=(0, 0))
#
# Simple and Composite AST nodes
#
class Node(object):
  """Leaf node of the AST: wraps a single lexer token.

  The token's val matters for numbers and names; its type identifies
  operators.
  """

  def __init__(self, token):
    self.token = token

  def __repr__(self):
    return str(self.token.val)
class CompositeNode(Node):
  """Interior node of the AST: an operator token plus child nodes.

  Rendered as an s-expression: (type child1 child2 ...).
  """

  def __init__(self, token, children):
    Node.__init__(self, token)
    self.children = children

  def __repr__(self):
    parts = [self.token.type]
    parts.extend(repr(c) for c in self.children)
    return '(%s)' % ' '.join(parts)
#
# Parser definition
#
class LeftInfo(object):
  """Operator-table row for a token used in infix/postfix position.

  In C++ this should be a big array.
  """

  def __init__(self, led=None, lbp=0, rbp=0):
    # Fall back to LeftError so unregistered tokens raise ParseError.
    self.led = led or LeftError
    self.lbp = lbp  # left binding power
    self.rbp = rbp  # right binding power
class NullInfo(object):
  """Operator-table row for a token used in prefix position.

  In C++ this should be a big array.
  """

  def __init__(self, nud=None, bp=0):
    # Fall back to NullError so unregistered tokens raise ParseError.
    self.nud = nud or NullError
    self.bp = bp  # binding power for the operand to the right
class ParserSpec(object):
  """Specification for a TDOP parser.

  Two tables map a token type to its parsing info:
    null_lookup: token type -> NullInfo, for tokens in prefix position.
    left_lookup: token type -> LeftInfo, for tokens in infix position.
  """

  def __init__(self):
    self.null_lookup = {}  # token type -> NullInfo
    self.left_lookup = {}  # token type -> LeftInfo

  def Null(self, bp, nud, tokens):
    """Register a token that doesn't take anything on the left.

    Examples: constant, prefix operator, error.
    """
    for token in tokens:
      self.null_lookup[token] = NullInfo(nud=nud, bp=bp)
      # Only install the error entry if no led was registered already.
      if token not in self.left_lookup:
        self.left_lookup[token] = LeftInfo()  # error

  def _RegisterLed(self, lbp, rbp, led, tokens):
    # Shared by Left and LeftRightAssoc.  Also installs a NullError entry
    # so the token is rejected in prefix position unless separately
    # registered via Null().
    for token in tokens:
      if token not in self.null_lookup:
        self.null_lookup[token] = NullInfo(NullError)
      self.left_lookup[token] = LeftInfo(lbp=lbp, rbp=rbp, led=led)

  def Left(self, bp, led, tokens):
    """Register a token that takes an expression on the left."""
    self._RegisterLed(bp, bp, led, tokens)

  def LeftRightAssoc(self, bp, led, tokens):
    """Register a right associative operator."""
    # rbp one less than lbp lets equal-precedence operators bind on the
    # right, which is what right associativity means.
    self._RegisterLed(bp, bp-1, led, tokens)

  def LookupNull(self, token):
    """Get the parsing function and precedence for a null position token."""
    try:
      nud = self.null_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)
    return nud

  def LookupLeft(self, token):
    """Get the parsing function and precedence for a left position token."""
    try:
      led = self.left_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)
    return led
# Sentinel the parser substitutes once the lexer is exhausted;
# ParseUntil checks for its 'eof' type to detect end of input.
EOF_TOKEN = Token('eof', 'eof')
class Parser(object):
  """Recursive TDOP (top-down operator precedence) parser."""

  def __init__(self, spec, lexer):
    """
    Args:
      spec: ParserSpec holding the operator tables.
      lexer: iterator of Token instances.
    """
    self.spec = spec
    self.lexer = lexer  # iterable
    self.token = None  # current token

  def AtToken(self, token_type):
    """Test if we are looking at a token of the given type."""
    return self.token.type == token_type

  def Next(self):
    """Advance to the next token, or EOF_TOKEN when the lexer is done."""
    try:
      # Fix: use the idiomatic next() builtin instead of calling the
      # __next__ dunder directly.
      t = next(self.lexer)
    except StopIteration:
      t = EOF_TOKEN
    self.token = t

  def Eat(self, val):
    """Assert the value of the current token, then move to the next token.

    A falsy val skips the check and just advances.
    """
    if val and not self.AtToken(val):
      raise ParseError('expected %s, got %s' % (val, self.token))
    self.Next()

  def ParseUntil(self, rbp):
    """
    Parse to the right, eating tokens until we encounter a token with binding
    power LESS THAN OR EQUAL TO rbp.
    """
    if self.AtToken('eof'):
      raise ParseError('Unexpected end of input')

    t = self.token
    self.Next()  # skip over the token, e.g. ! ~ + -
    null_info = self.spec.LookupNull(t.type)
    node = null_info.nud(self, t, null_info.bp)

    while True:
      t = self.token
      left_info = self.spec.LookupLeft(t.type)
      # Examples:
      # If we see 1*2+  , rbp = 27 and lbp = 25, so stop.
      # If we see 1+2+  , rbp = 25 and lbp = 25, so stop.
      # If we see 1**2**, rbp = 26 and lbp = 27, so keep going.
      if rbp >= left_info.lbp:
        break
      self.Next()  # skip over the token, e.g. / *
      node = left_info.led(self, t, node, left_info.rbp)
    return node

  def Parse(self):
    """Parse a whole expression from the lexer."""
    self.Next()
    return self.ParseUntil(0)