[j8] Escape invalid utf-8 with \yff

In Python, we can do it with s.decode('utf-8') In C++, we can do it with the Bjoern DFA.
oilshell · Dec 25, 2023 · 30e3316 · 30e3316
1 parent 5705c5f
commit 30e3316
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 27 deletions.
diff --git a/asdl/format.py b/asdl/format.py
@@ -25,8 +25,8 @@
 
 from typing import cast, Any, Optional
 
-
 if mylib.PYTHON:
+
     def PrettyPrint(obj, f=None):
         # type: (Any, Optional[mylib.Writer]) -> None
         """Print abbreviated tree in color, for unnit tests."""

diff --git a/data_lang/NINJA_subgraph.py b/data_lang/NINJA_subgraph.py
@@ -7,6 +7,7 @@
 from build import ninja_lib
 from build.ninja_lib import log
 
+
 def NinjaGraph(ru):
     n = ru.n
 
@@ -22,4 +23,3 @@ def NinjaGraph(ru):
         deps=['//data_lang/utf8_impls/utf8_decode'],
         # Add tcmalloc for malloc_address_test
         matrix=ninja_lib.COMPILERS_VARIANTS + [('cxx', 'tcmalloc')])
-
diff --git a/data_lang/j8_str.py b/data_lang/j8_str.py
@@ -4,64 +4,136 @@
 """
 
 from mycpp import mylib
+from mycpp.mylib import log
 
+from typing import Tuple, List
 
-def Encode(s, mode, buf):
+_ = log
+
+
+def Replace(e):
+    # type: (UnicodeDecodeError) -> Tuple[unicode, int]
+
+    #print('ZZ', e)
+    #raise e
+    #print(dir(e))
+
+    #print('OBJ', e.object)
+
+    # Can we record these positions somewhere?
+
+    # A byte string can be alternating slices of valid unicode ranges an
+    # invalid unicode ranges?
+    # So we're doing recovery?
+
+    print('%d %d' % (e.start, e.end))
+
+    return (u'ZZ', e.end)
+
+
+# This allows us to return Unicode, not what we want
+# Replace() has a weird type
+# codecs.register_error('j8', Replace)  # type: ignore
+
+
+def Enc(s, options):
+    # type: (str, int) -> str
+    buf = mylib.BufWriter()
+    Encode(s, options, buf)
+    return buf.getvalue()
+
+
+def Encode(s, options, buf):
     # type: (str, int, mylib.BufWriter) -> int
     """
     Callers:
 
     - json write
     - j8 write
     - the = operator
+    - pp line (x)
     - 'declare' prints in bash compatible syntax
 
-    Algorithm:
+    Simple algorithm:
 
-    1. Decode UTF-8 rune-by-rune, with 4 cases
-       - 1 byte
-       - 2 bytes
-       - 3 bytes - is this the surrrogate one?
-       - 4 bytes
+    1. Decode UTF-8 
+       In Python, use built-in s.decode('utf-8')
+       In C++, use Bjoern DFA
 
-    While detecting all these errors:
+    List of errors in UTF-8:
        - Invalid start byte
        - Invalid continuation byte
        - Incomplete UTF-8 char
        - Over-long UTF-8 encoding
        - Decodes to invalid code point (surrogate)
          - this changed in 2003; WTF-8 allows it
 
-    Error handling options:
-       JSON mode: Either
-       - errors are exceptions
-       - errors become Unicode Replacement Char
-       Option: unpaired surrogates like \\udc00 become errors, because errors
-       shouldn't travel over the wire
+    If decoding succeeds, then surround with "" 
+    - escape unprintable chars like \\u0001 and \\t \\n \\ \\"
 
-       J8 mode: No errors by definition
-       - All errors become \yff
+    If decoding fails (this includes unpaired surrogates like \\udc00)
+    - in J8 mode, all errors become \yff, and it must be a b"" string
+    - in JSON mode, based on options, either:
+      - use unicode replacement char (lossy)
+      - raise an exception, so the 'json dump' fails etc.
+        - Error can have location info
 
-    2. Encode in different modes
+    LATER: Options for encoding
 
        JSON mode:
          Prefer literal UTF-8
-         must use \\udc00 at times, so the overall message is valid UTF-8
+         Escaping mode: must use \\udc00 at times, so the overall message is
+           valid UTF-8
 
        J8 mode:
          Prefer literal UTF-8
-         All errors become \yff
-         Return a flag so you know to add the j"" prefix when using these.
-         Option to prefer \\u{123456}
+         Escaping mode to use j"\u{123456}" and perhaps b"\u{123456} when there
+         are also errors
 
        = mode:
          Option to prefer \\u{123456}
 
-       Shell mode:
-         Prefer literal UTF-8
-         Errors can be \\xff, not \yff
-         Option (low priority): use \\u1234 \\U00123456
+    Should we generate bash-compatible strings?
+       Like $'\\xff' for OSH
+       Option (low priority): use \\u1234 \\U00123456
     """
+    pos = 0
+    portion = s
+    invalid_utf8 = []  # type: List[Tuple[int, int]]
+    while True:
+        try:
+            portion.decode('utf-8')
+        except UnicodeDecodeError as e:
+            invalid_utf8.append((pos + e.start, pos + e.end))
+            pos += e.end
+        else:
+            break  # it validated
+        #log('== pos %d', pos)
+        portion = s[pos:]
+
+    #print('INVALID', invalid_utf8)
+    if len(invalid_utf8):
+        buf.write('b"')
+        pos = 0
+        for start, end in invalid_utf8:
+            buf.write(s[pos:start])
+
+            for i in xrange(start, end):
+                buf.write('\y%x' % ord(s[i]))
+
+            pos = end
+            #log('pos %d', pos)
+
+        buf.write(s[pos:])
+        buf.write('"')
+
+    else:
+        buf.write('"')
+        # TODO: escape \\ \" etc.
+        # could use str.maketrans or something?
+        buf.write(s)
+        buf.write('"')
+
     return 0
 
 
@@ -99,3 +171,6 @@ def py_decode(s):
     # TODO: Can use a regex as a demo
     # J8 strings are a regular language
     return s
+
+
+# vim: sw=4