[j8] Strings use single quotes: u'' and b''

Not u"" and b"".

Some reasons for this:

- It opens up '' as a synonym for u'' (could be used in TSV8, if not JSON8)
- In YSH, we have r'' u'' b'' that don't support interpolation
  - while "$x" supports interpolation
- JavaScript and Python both support '' in code, so it's not too unfamiliar
  - although note we're not compatible, e.g. because we use \u{123456}

Aesthetic:

- b'' is less noisy than b""
- you could argue that u'' is more distinct from JSON "" than u"" is
oilshell · Jan 4, 2024 · 5ac3494 · 5ac3494
1 parent 82af762
commit 5ac3494
Show file tree

Hide file tree

Showing 7 changed files with 79 additions and 34 deletions.
diff --git a/data_lang/pyj8.py b/data_lang/pyj8.py
@@ -34,9 +34,10 @@ def EncodeString(s, options):
 
 # similar to frontend/consts.py
 _JSON_ESCAPES = {
-    # Note: we don't escaping \/
+    # Notes:
+    # - we don't escape \/
+    # - \' and \" are decided dynamically, based on the quote
     '\\': '\\\\',
-    '"': '\\"',
     '\b': '\\b',
     '\f': '\\f',
     '\n': '\\n',
@@ -45,7 +46,7 @@ def EncodeString(s, options):
 }
 
 
-def _EscapeUnprintable(s, buf, u6_escapes=False):
+def _EscapeUnprintable(s, buf, is_j8=False):
     # type: (str, mylib.BufWriter, bool) -> None
     """ Print a string literal with required esceapes like \\n
 
@@ -58,11 +59,17 @@ def _EscapeUnprintable(s, buf, u6_escapes=False):
             buf.write(escaped)
             continue
 
+        if ch == "'" and is_j8:
+            buf.write(r"\'")
+            continue
+
+        if ch == '"' and not is_j8:
+            buf.write(r'\"')
+            continue
+
         char_code = ord(ch)
         if char_code < 0x20:  # like IsUnprintableLow
-            # TODO: mylib.hex_lower doesn't have padding
-            #buf.write(r'\u%04d' % char_code)
-            if u6_escapes:
+            if is_j8:
                 buf.write(r'\u{%x}' % char_code)
             else:
                 buf.write(r'\u%04x' % char_code)
@@ -144,7 +151,7 @@ def WriteString(s, options, buf):
         buf.write('b"')
         pos = 0
         for start, end in invalid_utf8:
-            _EscapeUnprintable(s[pos:start], buf, u6_escapes=True)
+            _EscapeUnprintable(s[pos:start], buf, is_j8=True)
 
             for i in xrange(start, end):
                 buf.write('\y%x' % ord(s[i]))
@@ -153,7 +160,7 @@ def WriteString(s, options, buf):
             #log('pos %d', pos)
 
         # Last part
-        _EscapeUnprintable(s[pos:], buf, u6_escapes=True)
+        _EscapeUnprintable(s[pos:], buf, is_j8=True)
         buf.write('"')
 
     else:
@@ -201,23 +208,27 @@ def _Error(self, msg, end_pos):
     def Next(self):
         # type: () -> Tuple[Id_t, int, Optional[str]]
 
-        # TODO: break dep
-        from osh import string_ops
-
         while True:  # ignore spaces
             tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)
             if tok_id != Id.Ignored_Space:
                 break
             self.pos = end_pos
 
-        # TODO: Distinguish bewteen "" b"" and u"", and allow different
+        # TODO: Distinguish bewteen "" b'' and u'', and allow different
         # escapes.
         if tok_id not in (Id.Left_DoubleQuote, Id.Left_USingleQuote,
                           Id.Left_BSingleQuote):
             self.pos = end_pos
             return tok_id, end_pos, None
 
-        str_pos = end_pos
+        return self._DecodeString(tok_id, end_pos)
+
+    def _DecodeString(self, left_id, str_pos):
+        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
+
+        # TODO: break dep
+        from osh import string_ops
+
         while True:
             tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)
 
@@ -227,13 +238,17 @@ def Next(self):
                                   str_end)
             if tok_id == Id.Unknown_Tok:
                 # e.g. invalid backslash
-                raise self._Error(
-                    'Unknown token while lexing JSON string', str_end)
+                raise self._Error('Unknown token while lexing JSON string',
+                                  str_end)
             if tok_id == Id.Char_AsciiControl:
                 raise self._Error(
                     "ASCII control chars are illegal in JSON strings", str_end)
 
-            if tok_id == Id.Right_DoubleQuote:
+            # yapf: disable
+            if (left_id == Id.Left_DoubleQuote and tok_id == Id.Right_DoubleQuote or
+                left_id != Id.Left_DoubleQuote and tok_id == Id.Right_SingleQuote):
+                # yapf: enable
+
                 self.pos = str_end
 
                 s = self.decoded.getvalue()
@@ -255,7 +270,16 @@ def Next(self):
             # Now handle each kind of token
             #
 
-            if tok_id == Id.Char_Literals:  # JSON and J8
+            # "'" and u'"' are OK unescaped
+            # yapf: disable
+            if (left_id == Id.Left_DoubleQuote and tok_id == Id.Right_SingleQuote or
+                left_id != Id.Left_DoubleQuote and tok_id == Id.Right_DoubleQuote):
+                # yapf: enable
+
+                assert str_end == str_pos + 1, (str_pos, str_end)
+                part = self.s[str_pos]
+
+            elif tok_id == Id.Char_Literals:  # JSON and J8
                 part = self.s[str_pos:str_end]
                 try:
                     part.decode('utf-8')

diff --git a/frontend/lexer_def.py b/frontend/lexer_def.py
@@ -526,9 +526,8 @@ def R(pat, tok_type):
 
 J8_DEF = [
     C('"', Id.Left_DoubleQuote),  # JSON string
-    # TODO: change to single quote
-    C('u"', Id.Left_USingleQuote),  # unicode string
-    C('b"', Id.Left_BSingleQuote),  # byte string
+    C("u'", Id.Left_USingleQuote),  # unicode string
+    C("b'", Id.Left_BSingleQuote),  # byte string
     C('[', Id.J8_LBracket),
     C(']', Id.J8_RBracket),
     C('{', Id.J8_LBrace),
@@ -553,9 +552,8 @@ def R(pat, tok_type):
 
 # Union of escapes that "" u"" b"" accept.  Validation is separate.
 J8_STR_DEF = [
-    # TODO: remove double quote
-    C('"', Id.Right_DoubleQuote),
-    C("'", Id.Right_SingleQuote),
+    C('"', Id.Right_DoubleQuote),  # end for JSON
+    C("'", Id.Right_SingleQuote),  # end for J8
 
     # https://json.org list of chars
     R(r'\\["\\/bfnrt]', Id.Char_OneChar),

diff --git a/osh/word_parse.py b/osh/word_parse.py
@@ -627,8 +627,8 @@ def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_ysh_expr):
         no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
 
         expected_end_tokens = 3 if left_token.id in (
-            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
-            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
+            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
+            Id.Left_BTSingleQuote) else 1
         num_end_tokens = 0
 
         while num_end_tokens < expected_end_tokens:
@@ -724,7 +724,7 @@ def _ReadYshSingleQuoted(self, left_id):
         """
         #log('BEF self.cur_token %s', self.cur_token)
         if left_id == Id.Left_RSingleQuote:
-            lexer_mode = lex_mode_e.SQ_Raw 
+            lexer_mode = lex_mode_e.SQ_Raw
             triple_left_id = Id.Left_RTSingleQuote
         elif left_id == Id.Left_USingleQuote:
             lexer_mode = lex_mode_e.J8_Str
@@ -809,8 +809,9 @@ def _ReadUnquotedLeftParts(self, triple_out):
 
             # Got empty '' or r'' and there's a ' after
             # u'' and b'' are handled in _ReadYshSingleQuoted
-            if (triple_out and triple_left_id != Id.Undefined_Tok and
-                len(sq_part.tokens) == 0 and self.lexer.ByteLookAhead() == "'"):
+            if (triple_left_id != Id.Undefined_Tok and triple_out and
+                    len(sq_part.tokens) == 0 and
+                    self.lexer.ByteLookAhead() == "'"):
 
                 self._SetNext(lex_mode_e.ShCommand)
                 self._GetToken()
@@ -1957,9 +1958,9 @@ def _ReadWord(self, word_mode):
                             self.cur_token.tval in ('r', 'u', 'b')):
 
                         if self.cur_token.tval == 'r':
-                            left_id = Id.Left_RSingleQuote 
+                            left_id = Id.Left_RSingleQuote
                         elif self.cur_token.tval == 'u':
-                            left_id = Id.Left_USingleQuote 
+                            left_id = Id.Left_USingleQuote
                         else:
                             left_id = Id.Left_BSingleQuote
 

diff --git a/spec/testdata/j8-read.sh b/spec/testdata/j8-read.sh
@@ -19,7 +19,7 @@ pp line (_reply)
 echo '{"k": 1, "k2": 2}' | j8 read
 pp line (_reply)
 
-echo '{u"k": {b"k2": null}}' | j8 read
+echo "{u'k': {b'k2': null}}" | j8 read
 pp line (_reply)
 
 echo '{"k": {"k2": "v2"}, "k3": "backslash \\ \" \n line 2 \u03bc "}' | j8 read

diff --git a/spec/ysh-json.test.sh b/spec/ysh-json.test.sh
@@ -440,3 +440,24 @@ status=4
 ASCII control chars
 ## END
 
+
+#### JSON string can have unescaped ' and J8 string can have unescaped "
+
+json read <<EOF
+"'"
+EOF
+
+pp line (_reply)
+
+
+
+j8 read <<EOF
+u'"'
+EOF
+
+pp line (_reply)
+
+## STDOUT:
+(Str)   "'"
+(Str)   "\""
+## END
diff --git a/test/ysh-every-string.sh b/test/ysh-every-string.sh
@@ -37,6 +37,7 @@ EOF
 test-legacy-expr() {
   for sh in $YSH; do
     $sh <<'EOF'
+  # can't have backslash without r, and can't have single quote
   var x = 'foo "
 ---'
   echo $x

diff --git a/ysh/expr_parse.py b/ysh/expr_parse.py
@@ -289,14 +289,14 @@ def _PushOilTokens(parse_ctx, gr, p, lex, tea_keywords):
 
         # 'x'  '''x'''
         # r'x'  r'''x'''
-        # $'x'
         # u'x'  u'''x'''
         # b'x'  b'''x'''
+        # $'x'
         if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                       Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
-                      Id.Left_DollarSingleQuote,
                       Id.Left_USingleQuote, Id.Left_UTSingleQuote,
-                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
+                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
+                      Id.Left_DollarSingleQuote):
             if tok.id == Id.Left_DollarSingleQuote:
                 sq_mode = lex_mode_e.SQ_C
             elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,