Permalink
Browse files

All 10 files in benchmark/testdata can now be oheap-encoded.

- Use const.NO_INTEGER for more span IDs.

- Don't encode to utf-8 while serializing to oheap.  The encoder should
  just pass bytes straight through.

The issue is that the lexer will read individual bytes of utf-8
characters and split them into Lit_Other tokens.  This is fine since
normally we just re-concatenate the bytes.

But it's not OK to encode them one at a time!

Make note of a possible fix in osh/lex.py and spec/unicode.sh.
  • Loading branch information...
Andy Chu
Andy Chu committed Nov 26, 2017
1 parent 10c0897 commit de19b3d04aa6ac6b63a86e51100d275f919b4459
Showing with 46 additions and 26 deletions.
  1. +23 −15 asdl/encode.py
  2. +9 −9 osh/cmd_parse.py
  3. +2 −0 osh/lex.py
  4. +4 −2 osh/word_parse.py
  5. +8 −0 spec/unicode.sh
View
@@ -8,9 +8,17 @@
from asdl import py_meta
from asdl import const
_DEFAULT_ALIGNMENT = 4
from core import util
class EncodeError(Exception):
  """Raised when a value cannot be serialized to the oheap format.

  The details_printed flag lets the recursive encoder log context about the
  failing value exactly once while the exception propagates upward, instead
  of re-logging at every level of nesting.
  """

  def __init__(self, *args, **kwargs):
    super(EncodeError, self).__init__(*args, **kwargs)
    self.details_printed = False
_DEFAULT_ALIGNMENT = 4
class BinOutput:
"""Write aligned blocks here. Keeps track of block indexes for refs."""
@@ -75,10 +83,11 @@ def Tag(self, i, chunk):
def Int(self, n, chunk):
if n < 0:
raise RuntimeError(
raise EncodeError(
"ASDL can't currently encode negative numbers. Got %d" % n)
if n > self.max_int:
raise RuntimeError('%d is too big to fit in %d bytes' % (n, self.int_width))
raise EncodeError(
'%d is too big to fit in %d bytes' % (n, self.int_width))
for i in range(self.int_width):
chunk.append(n & 0xFF)
@@ -101,7 +110,7 @@ def Str(self, s, chunk):
# pre-compute and store a hash value. They will be looked up in the stack
# and so forth.
# - You could also return an obj number or object ID.
chunk.extend(s.encode('utf-8'))
chunk.extend(s)
chunk.append(0) # NUL terminator
def PaddedStr(self, s):
@@ -118,11 +127,11 @@ def PaddedStr(self, s):
def Bytes(self, buf, chunk):
n = len(buf)
if n >= self.max_index:
raise RuntimeError("bytes object is too long (%d)" % n)
raise EncodeError("bytes object is too long (%d)" % n)
for i in range(self.index_width):
chunk.append(n & 0xFF)
n >>= 8
chunk.extend(buf.encode('utf-8'))
chunk.extend(buf)
def PaddedBytes(self, buf):
chunk = bytearray()
@@ -170,12 +179,12 @@ def EncodeArray(obj_list, item_desc, enc, out):
# - Sum types can even be put in line, if you have List<T> rather than
# Array<T>. Array implies O(1) random access; List doesn't.
for item in obj_list:
# Recursive call.
from core import util
try:
ref = EncodeObj(item, enc, out)
except RuntimeError as e:
util.log("Error encoding array: %s (item %s)", e, item)
except EncodeError as e:
if not e.details_printed:
util.log("Error encoding array: %s (item %s)", e, item)
e.details_printed = True
raise
enc.Ref(ref, array_chunk)
@@ -253,13 +262,12 @@ def EncodeObj(obj, enc, out):
if is_maybe and field_val is None:
enc.Ref(0, this_chunk)
else:
# Recursive call for CompoundObj children. Write children before
# parents.
from core import util
try:
ref = EncodeObj(field_val, enc, out)
except RuntimeError as e:
util.log("Error encoding %s : %s (val %s)", name, e, field_val)
except EncodeError as e:
if not e.details_printed:
util.log("Error encoding %s : %s (val %s)", name, e, field_val)
e.details_printed = True
raise
enc.Ref(ref, this_chunk)
View
@@ -673,11 +673,11 @@ def ParseForWords(self):
;
"""
words = []
semi_spid = -1 # The span_id of any semi-colon, so we can remove it.
# The span_id of any semi-colon, so we can remove it.
semi_spid = const.NO_INTEGER
while True:
if not self._Peek(): return None
# TODO: Add span ID of semi-colon or -1
if self.c_id == Id.Op_Semi:
semi_spid = self.cur_word.token.span_id # TokenWord
self._Next()
@@ -745,8 +745,8 @@ def _ParseForEachLoop(self):
if not self._NewlineOk(): return None
in_spid = -1
semi_spid = -1
in_spid = const.NO_INTEGER
semi_spid = const.NO_INTEGER
if not self._Peek(): return None
if self.c_id == Id.KW_In:
@@ -862,8 +862,8 @@ def ParseCaseItem(self):
else:
action_children = []
dsemi_spid = -1
last_spid = -1
dsemi_spid = const.NO_INTEGER
last_spid = const.NO_INTEGER
if not self._Peek(): return None
if self.c_id == Id.KW_Esac:
last_spid = word.LeftMostSpanForWord(self.cur_word)
@@ -965,7 +965,7 @@ def _ParseElifElse(self, if_node):
if not body: return None
if_node.else_action = body.children
else:
else_spid = -1
else_spid = const.NO_INTEGER
if_node.spids.append(else_spid)
@@ -988,14 +988,14 @@ def ParseIf(self):
if not body: return None
arm = ast.if_arm(cond.children, body.children)
arm.spids.extend((-1, then_spid)) # no if spid at first?
arm.spids.extend((const.NO_INTEGER, then_spid)) # no if spid at first?
if_node.arms.append(arm)
if self.c_id in (Id.KW_Elif, Id.KW_Else):
if not self._ParseElifElse(if_node):
return None
else:
if_node.spids.append(-1) # no else spid
if_node.spids.append(const.NO_INTEGER) # no else spid
fi_spid = word.LeftMostSpanForWord(self.cur_word)
if not self._Eat(Id.KW_Fi): return None
View
@@ -156,6 +156,8 @@
]
_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
# NOTE: We could possibly add every byte value 128 and above to this character
# class, so that multi-byte utf-8 characters don't get split into tokens.
R(r'[a-zA-Z0-9_/.-]+', Id.Lit_Chars),
# e.g. beginning of NAME=val, which will always be longer than the above
# Id.Lit_Chars.
View
@@ -9,6 +9,8 @@
word_parse.py - Parse the shell word language.
"""
from asdl import const
from core.id_kind import Id, Kind, LookupKind
from core import braces
from core import word
@@ -624,8 +626,8 @@ def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
Also ${foo%%a b c} # treat this as double quoted. until you hit
"""
quoted_part = ast.DoubleQuotedPart()
left_spid = -1
right_spid = -1 # gets set later
left_spid = const.NO_INTEGER
right_spid = const.NO_INTEGER # gets set later
if self.cur_token is not None: # None in here doc case
left_spid = self.cur_token.span_id
View
@@ -93,4 +93,12 @@ print(repr(json.loads(sys.stdin.read())))
'
}
# Right now it's split into (Lit_Other '\xce') and (Lit_Other '\xbc'). This is
# fine for most purposes, although we could probably simplify this.
osh-literal() {
  # Spec check for a utf-8 multibyte char in a word.
  # NOTE(review): -n presumably means parse-only here — the parse tree shows
  # how the lexer splits μ (0xce 0xbc) into separate Lit_Other tokens; confirm
  # against bin/osh's flag handling.
  bin/osh -n -c 'echo [μ]'
  # This works fine
  bin/osh -c 'echo [μ]'
}
"$@"

0 comments on commit de19b3d

Please sign in to comment.