Skip to content

Commit

Permalink
[j8] Sketch a more efficient encoding interface
Browse files Browse the repository at this point in the history
Fix escaping that caused parse errors.
  • Loading branch information
Andy Chu committed Dec 25, 2023
1 parent 4e3072c commit 984fbc2
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 33 deletions.
5 changes: 3 additions & 2 deletions data_lang/j8.py
Expand Up @@ -243,18 +243,19 @@ def Print(self, val, level=0):

elif case(value_e.Int):
val = cast(value.Int, UP_val)
# TODO: buf.write_int() would avoid allocation
# TODO: use pyj8.WriteInt(val.i, self.buf)
self.buf.write(str(val.i))

elif case(value_e.Float):
val = cast(value.Float, UP_val)

# TODO: buf.write_float() would avoid allocation
# TODO: use pyj8.WriteFloat(val.f, self.buf)
self.buf.write(str(val.f))

elif case(value_e.Str):
val = cast(value.Str, UP_val)

# TODO: pyj8.WriteString(val.s, self.buf)
self._StringToBuf(val.s)

elif case(value_e.List):
Expand Down
41 changes: 16 additions & 25 deletions data_lang/pyj8.py
Expand Up @@ -12,39 +12,30 @@
_ = log


def Replace(e):
# type: (UnicodeDecodeError) -> Tuple[unicode, int]

#print('ZZ', e)
#raise e
#print(dir(e))

#print('OBJ', e.object)

# Can we record these positions somewhere?

# A byte string can be alternating slices of valid unicode ranges and
# invalid unicode ranges?
# So we're doing recovery?

print('%d %d' % (e.start, e.end))

return (u'ZZ', e.end)
def WriteInt(i, buf):
    # type: (int, mylib.BufWriter) -> None
    """Append the text form of the integer `i` to `buf`.

    The value is formatted with str() and passed to buf.write(), so this
    Python version builds a temporary string.  The C++ version can avoid
    that allocation.
    """
    text = str(i)
    buf.write(text)


# This allows us to return Unicode, not what we want
# Replace() has a weird type
# codecs.register_error('j8', Replace) # type: ignore
def WriteFloat(f, buf):
    # type: (float, mylib.BufWriter) -> None
    """Append the text form of the float `f` to `buf`.

    The value is formatted with str() and passed to buf.write(), so this
    Python version builds a temporary string.  The C++ version can avoid
    that allocation.
    """
    text = str(f)
    buf.write(text)


def Enc(s, options):
def EncodeString(s, options):
# type: (str, int) -> str
buf = mylib.BufWriter()
Encode(s, options, buf)
WriteString(s, options, buf)
return buf.getvalue()


def Encode(s, options, buf):
def WriteString(s, options, buf):
# type: (str, int, mylib.BufWriter) -> int
"""
Callers:
Expand Down Expand Up @@ -88,7 +79,7 @@ def Encode(s, options, buf):
J8 mode:
Prefer literal UTF-8
Escaping mode to use j"\u{123456}" and perhaps b"\u{123456}" when there
Escaping mode to use j"\\u{123456}" and perhaps b"\\u{123456}" when there
are also errors
= mode:
Expand Down
12 changes: 6 additions & 6 deletions data_lang/pyj8_test.py
Expand Up @@ -12,26 +12,26 @@
class J8Test(unittest.TestCase):

def testEncode(self):
en = pyj8.Enc('hello', 0)
en = pyj8.EncodeString('hello', 0)
print(en)

en = pyj8.Enc('\xff-\xfe-\xff-\xfe', 0)
en = pyj8.EncodeString('\xff-\xfe-\xff-\xfe', 0)
print(en)

# multiple errors
en = pyj8.Enc('hello\xffthere \xfe\xff gah', 0)
en = pyj8.EncodeString('hello\xffthere \xfe\xff gah', 0)
print(en)

# valid mu
en = pyj8.Enc('hello \xce\xbc there', 0)
en = pyj8.EncodeString('hello \xce\xbc there', 0)
print(en)

# two first bytes - invalid
en = pyj8.Enc('hello \xce\xce there', 0)
en = pyj8.EncodeString('hello \xce\xce there', 0)
print(en)

# two cont bytes - invalid
en = pyj8.Enc('hello \xbc\xbc there', 0)
en = pyj8.EncodeString('hello \xbc\xbc there', 0)
print(en)


Expand Down

0 comments on commit 984fbc2

Please sign in to comment.