Skip to content

Commit

Permalink
[j8] Strings use single quotes: u'' and b''
Browse files Browse the repository at this point in the history
Not u"" and b""

Some reasons for this

- It opens up '' as a synonym for u'' (could be used in TSV8, if not
  JSON8)
- In YSH, we have r'' u'' b'' that don't support interpolation
  - while "$x" supports interpolation
- JavaScript and Python both support '' in code, so it's not too
  unfamiliar
  - although note we're not compatible, e.g. because we use \u{123456}

Aesthetic:

- b'' is less noisy than b""
- you could argue that u'' is more distinct from JSON "" than u"" is
  • Loading branch information
Andy C committed Jan 4, 2024
1 parent 82af762 commit 5ac3494
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 34 deletions.
58 changes: 41 additions & 17 deletions data_lang/pyj8.py
Expand Up @@ -34,9 +34,10 @@ def EncodeString(s, options):

# similar to frontend/consts.py
_JSON_ESCAPES = {
# Note: we don't escaping \/
# Notes:
# - we don't escape \/
# - \' and \" are decided dynamically, based on the quote
'\\': '\\\\',
'"': '\\"',
'\b': '\\b',
'\f': '\\f',
'\n': '\\n',
Expand All @@ -45,7 +46,7 @@ def EncodeString(s, options):
}


def _EscapeUnprintable(s, buf, u6_escapes=False):
def _EscapeUnprintable(s, buf, is_j8=False):
# type: (str, mylib.BufWriter, bool) -> None
""" Print a string literal with required escapes like \\n
Expand All @@ -58,11 +59,17 @@ def _EscapeUnprintable(s, buf, u6_escapes=False):
buf.write(escaped)
continue

if ch == "'" and is_j8:
buf.write(r"\'")
continue

if ch == '"' and not is_j8:
buf.write(r'\"')
continue

char_code = ord(ch)
if char_code < 0x20: # like IsUnprintableLow
# TODO: mylib.hex_lower doesn't have padding
#buf.write(r'\u%04d' % char_code)
if u6_escapes:
if is_j8:
buf.write(r'\u{%x}' % char_code)
else:
buf.write(r'\u%04x' % char_code)
Expand Down Expand Up @@ -144,7 +151,7 @@ def WriteString(s, options, buf):
buf.write('b"')
pos = 0
for start, end in invalid_utf8:
_EscapeUnprintable(s[pos:start], buf, u6_escapes=True)
_EscapeUnprintable(s[pos:start], buf, is_j8=True)

for i in xrange(start, end):
buf.write('\y%x' % ord(s[i]))
Expand All @@ -153,7 +160,7 @@ def WriteString(s, options, buf):
#log('pos %d', pos)

# Last part
_EscapeUnprintable(s[pos:], buf, u6_escapes=True)
_EscapeUnprintable(s[pos:], buf, is_j8=True)
buf.write('"')

else:
Expand Down Expand Up @@ -201,23 +208,27 @@ def _Error(self, msg, end_pos):
def Next(self):
# type: () -> Tuple[Id_t, int, Optional[str]]

# TODO: break dep
from osh import string_ops

while True: # ignore spaces
tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)
if tok_id != Id.Ignored_Space:
break
self.pos = end_pos

# TODO: Distinguish bewteen "" b"" and u"", and allow different
# TODO: Distinguish between "" b'' and u'', and allow different
# escapes.
if tok_id not in (Id.Left_DoubleQuote, Id.Left_USingleQuote,
Id.Left_BSingleQuote):
self.pos = end_pos
return tok_id, end_pos, None

str_pos = end_pos
return self._DecodeString(tok_id, end_pos)

def _DecodeString(self, left_id, str_pos):
# type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]

# TODO: break dep
from osh import string_ops

while True:
tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

Expand All @@ -227,13 +238,17 @@ def Next(self):
str_end)
if tok_id == Id.Unknown_Tok:
# e.g. invalid backslash
raise self._Error(
'Unknown token while lexing JSON string', str_end)
raise self._Error('Unknown token while lexing JSON string',
str_end)
if tok_id == Id.Char_AsciiControl:
raise self._Error(
"ASCII control chars are illegal in JSON strings", str_end)

if tok_id == Id.Right_DoubleQuote:
# yapf: disable
if (left_id == Id.Left_DoubleQuote and tok_id == Id.Right_DoubleQuote or
left_id != Id.Left_DoubleQuote and tok_id == Id.Right_SingleQuote):
# yapf: enable

self.pos = str_end

s = self.decoded.getvalue()
Expand All @@ -255,7 +270,16 @@ def Next(self):
# Now handle each kind of token
#

if tok_id == Id.Char_Literals: # JSON and J8
# "'" and u'"' are OK unescaped
# yapf: disable
if (left_id == Id.Left_DoubleQuote and tok_id == Id.Right_SingleQuote or
left_id != Id.Left_DoubleQuote and tok_id == Id.Right_DoubleQuote):
# yapf: enable

assert str_end == str_pos + 1, (str_pos, str_end)
part = self.s[str_pos]

elif tok_id == Id.Char_Literals: # JSON and J8
part = self.s[str_pos:str_end]
try:
part.decode('utf-8')
Expand Down
10 changes: 4 additions & 6 deletions frontend/lexer_def.py
Expand Up @@ -526,9 +526,8 @@ def R(pat, tok_type):

J8_DEF = [
C('"', Id.Left_DoubleQuote), # JSON string
# TODO: change to single quote
C('u"', Id.Left_USingleQuote), # unicode string
C('b"', Id.Left_BSingleQuote), # byte string
C("u'", Id.Left_USingleQuote), # unicode string
C("b'", Id.Left_BSingleQuote), # byte string
C('[', Id.J8_LBracket),
C(']', Id.J8_RBracket),
C('{', Id.J8_LBrace),
Expand All @@ -553,9 +552,8 @@ def R(pat, tok_type):

# Union of escapes that "" u'' b'' accept. Validation is separate.
J8_STR_DEF = [
# TODO: remove double quote
C('"', Id.Right_DoubleQuote),
C("'", Id.Right_SingleQuote),
C('"', Id.Right_DoubleQuote), # end for JSON
C("'", Id.Right_SingleQuote), # end for J8

# https://json.org list of chars
R(r'\\["\\/bfnrt]', Id.Char_OneChar),
Expand Down
15 changes: 8 additions & 7 deletions osh/word_parse.py
Expand Up @@ -627,8 +627,8 @@ def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_ysh_expr):
no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

expected_end_tokens = 3 if left_token.id in (
Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
Id.Left_BTSingleQuote) else 1
num_end_tokens = 0

while num_end_tokens < expected_end_tokens:
Expand Down Expand Up @@ -724,7 +724,7 @@ def _ReadYshSingleQuoted(self, left_id):
"""
#log('BEF self.cur_token %s', self.cur_token)
if left_id == Id.Left_RSingleQuote:
lexer_mode = lex_mode_e.SQ_Raw
lexer_mode = lex_mode_e.SQ_Raw
triple_left_id = Id.Left_RTSingleQuote
elif left_id == Id.Left_USingleQuote:
lexer_mode = lex_mode_e.J8_Str
Expand Down Expand Up @@ -809,8 +809,9 @@ def _ReadUnquotedLeftParts(self, triple_out):

# Got empty '' or r'' and there's a ' after
# u'' and b'' are handled in _ReadYshSingleQuoted
if (triple_out and triple_left_id != Id.Undefined_Tok and
len(sq_part.tokens) == 0 and self.lexer.ByteLookAhead() == "'"):
if (triple_left_id != Id.Undefined_Tok and triple_out and
len(sq_part.tokens) == 0 and
self.lexer.ByteLookAhead() == "'"):

self._SetNext(lex_mode_e.ShCommand)
self._GetToken()
Expand Down Expand Up @@ -1957,9 +1958,9 @@ def _ReadWord(self, word_mode):
self.cur_token.tval in ('r', 'u', 'b')):

if self.cur_token.tval == 'r':
left_id = Id.Left_RSingleQuote
left_id = Id.Left_RSingleQuote
elif self.cur_token.tval == 'u':
left_id = Id.Left_USingleQuote
left_id = Id.Left_USingleQuote
else:
left_id = Id.Left_BSingleQuote

Expand Down
2 changes: 1 addition & 1 deletion spec/testdata/j8-read.sh
Expand Up @@ -19,7 +19,7 @@ pp line (_reply)
echo '{"k": 1, "k2": 2}' | j8 read
pp line (_reply)

echo '{u"k": {b"k2": null}}' | j8 read
echo "{u'k': {b'k2': null}}" | j8 read
pp line (_reply)

echo '{"k": {"k2": "v2"}, "k3": "backslash \\ \" \n line 2 \u03bc "}' | j8 read
Expand Down
21 changes: 21 additions & 0 deletions spec/ysh-json.test.sh
Expand Up @@ -440,3 +440,24 @@ status=4
ASCII control chars
## END


#### JSON string can have unescaped ' and J8 string can have unescaped "

json read <<EOF
"'"
EOF

pp line (_reply)



j8 read <<EOF
u'"'
EOF

pp line (_reply)

## STDOUT:
(Str) "'"
(Str) "\""
## END
1 change: 1 addition & 0 deletions test/ysh-every-string.sh
Expand Up @@ -37,6 +37,7 @@ EOF
test-legacy-expr() {
for sh in $YSH; do
$sh <<'EOF'
# can't have backslash without r, and can't have single quote
var x = 'foo "
---'
echo $x
Expand Down
6 changes: 3 additions & 3 deletions ysh/expr_parse.py
Expand Up @@ -289,14 +289,14 @@ def _PushOilTokens(parse_ctx, gr, p, lex, tea_keywords):

# 'x' '''x'''
# r'x' r'''x'''
# $'x'
# u'x' u'''x'''
# b'x' b'''x'''
# $'x'
if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
Id.Left_DollarSingleQuote,
Id.Left_USingleQuote, Id.Left_UTSingleQuote,
Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
Id.Left_DollarSingleQuote):
if tok.id == Id.Left_DollarSingleQuote:
sq_mode = lex_mode_e.SQ_C
elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
Expand Down

0 comments on commit 5ac3494

Please sign in to comment.