[j8] Sketch a more efficient encoding interface

Fix escaping that caused parse errors.
oilshell · Dec 25, 2023 · 984fbc2 · 984fbc2
1 parent 4e3072c
commit 984fbc2
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 33 deletions.
diff --git a/data_lang/j8.py b/data_lang/j8.py
@@ -243,18 +243,19 @@ def Print(self, val, level=0):
 
             elif case(value_e.Int):
                 val = cast(value.Int, UP_val)
-                # TODO: buf.write_int() would avoid allocation
+                # TODO: use pyj8.WriteInt(val.i, self.buf)
                 self.buf.write(str(val.i))
 
             elif case(value_e.Float):
                 val = cast(value.Float, UP_val)
 
-                # TODO: buf.write_float() would avoid allocation
+                # TODO: use pyj8.WriteFloat(val.f, self.buf)
                 self.buf.write(str(val.f))
 
             elif case(value_e.Str):
                 val = cast(value.Str, UP_val)
 
+                # TODO: use pyj8.WriteString(val.s, options, self.buf)
                 self._StringToBuf(val.s)
 
             elif case(value_e.List):

diff --git a/data_lang/pyj8.py b/data_lang/pyj8.py
@@ -12,39 +12,30 @@
 _ = log
 
 
-def Replace(e):
-    # type: (UnicodeDecodeError) -> Tuple[unicode, int]
-
-    #print('ZZ', e)
-    #raise e
-    #print(dir(e))
-
-    #print('OBJ', e.object)
-
-    # Can we record these positions somewhere?
-
-    # A byte string can be alternating slices of valid unicode ranges an
-    # invalid unicode ranges?
-    # So we're doing recovery?
-
-    print('%d %d' % (e.start, e.end))
-
-    return (u'ZZ', e.end)
+def WriteInt(i, buf):
+    # type: (int, mylib.BufWriter) -> None
+    """
+    C++ version can avoid allocation
+    """
+    buf.write(str(i))
 
 
-# This allows us to return Unicode, not what we want
-# Replace() has a weird type
-# codecs.register_error('j8', Replace)  # type: ignore
+def WriteFloat(f, buf):
+    # type: (float, mylib.BufWriter) -> None
+    """
+    C++ version can avoid allocation
+    """
+    buf.write(str(f))
 
 
-def Enc(s, options):
+def EncodeString(s, options):
     # type: (str, int) -> str
     buf = mylib.BufWriter()
-    Encode(s, options, buf)
+    WriteString(s, options, buf)
     return buf.getvalue()
 
 
-def Encode(s, options, buf):
+def WriteString(s, options, buf):
     # type: (str, int, mylib.BufWriter) -> int
     """
     Callers:
@@ -88,7 +79,7 @@ def Encode(s, options, buf):
 
        J8 mode:
          Prefer literal UTF-8
-         Escaping mode to use j"\u{123456}" and perhaps b"\u{123456} when there
+         Escaping mode to use j"\\u{123456}" and perhaps b"\\u{123456}" when there
          are also errors
 
        = mode:

diff --git a/data_lang/pyj8_test.py b/data_lang/pyj8_test.py
@@ -12,26 +12,26 @@
 class J8Test(unittest.TestCase):
 
     def testEncode(self):
-        en = pyj8.Enc('hello', 0)
+        en = pyj8.EncodeString('hello', 0)
         print(en)
 
-        en = pyj8.Enc('\xff-\xfe-\xff-\xfe', 0)
+        en = pyj8.EncodeString('\xff-\xfe-\xff-\xfe', 0)
         print(en)
 
         # multiple errors
-        en = pyj8.Enc('hello\xffthere \xfe\xff gah', 0)
+        en = pyj8.EncodeString('hello\xffthere \xfe\xff gah', 0)
         print(en)
 
         # valid mu
-        en = pyj8.Enc('hello \xce\xbc there', 0)
+        en = pyj8.EncodeString('hello \xce\xbc there', 0)
         print(en)
 
         # two first bytes - invalid
-        en = pyj8.Enc('hello \xce\xce there', 0)
+        en = pyj8.EncodeString('hello \xce\xce there', 0)
         print(en)
 
         # two cont bytes - invalid
-        en = pyj8.Enc('hello \xbc\xbc there', 0)
+        en = pyj8.EncodeString('hello \xbc\xbc there', 0)
         print(en)