Skip to content

Commit

Permalink
[word_parse refactor] Prepare for J8 multi-line strings
Browse files Browse the repository at this point in the history
u''' and b'''

r''' should also be handled by the same function.
  • Loading branch information
Andy C committed Jan 2, 2024
1 parent 590cb4e commit ab32747
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 42 deletions.
2 changes: 0 additions & 2 deletions frontend/id_kind_def.py
Expand Up @@ -434,7 +434,6 @@ def AddKinds(spec):
'DoubleQuote',
'SingleQuote', # ''
'DollarSingleQuote', # $'' for \n escapes

'RSingleQuote', # r''
'USingleQuote', # u''
'BSingleQuote', # b''
Expand All @@ -445,7 +444,6 @@ def AddKinds(spec):
'RTSingleQuote', # r''' '''
'UTSingleQuote', # u''' '''
'BTSingleQuote', # b''' '''

'DollarTSingleQuote', # $''' '''
'Backtick', # `
'DollarParen', # $(
Expand Down
2 changes: 0 additions & 2 deletions frontend/lexer_def.py
Expand Up @@ -806,15 +806,13 @@ def R(pat, tok_type):
C("u'", Id.Left_USingleQuote),
C("b'", Id.Left_BSingleQuote),
C("$'", Id.Left_DollarSingleQuote), # TODO: remove

C('"""', Id.Left_TDoubleQuote),
# In expression mode, we add the r'' and c'' prefixes for '' and $''.
C("'''", Id.Left_TSingleQuote),
C("r'''", Id.Left_RTSingleQuote),
C("u'''", Id.Left_UTSingleQuote),
C("b'''", Id.Left_BTSingleQuote),
C("$'''", Id.Left_DollarTSingleQuote), # TODO: remove from YSH expressions

C('@(', Id.Left_AtParen), # Split Command Sub
C('^(', Id.Left_CaretParen), # Block literals in expression mode
C('^[', Id.Left_CaretBracket), # Expr literals
Expand Down
106 changes: 68 additions & 38 deletions osh/word_parse.py
Expand Up @@ -619,12 +619,12 @@ def _ReadSingleQuoted(self, left_token, lex_mode):
node = SingleQuoted(left_token, tokens, right_quote)
return node

def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_oil_expr):
def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_ysh_expr):
# type: (lex_mode_t, Token, List[Token], bool) -> Token
"""Used by expr_parse.py."""

# echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
no_backslashes = is_oil_expr and left_token.id == Id.Left_SingleQuote
no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

expected_end_tokens = 3 if left_token.id in (
Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
Expand All @@ -645,7 +645,7 @@ def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_oil_expr):
r"Strings with backslashes should look like r'\n' or $'\n'",
tok)

if is_oil_expr:
if is_ysh_expr:
if self.token_type == Id.Char_Octal3:
p_die(
r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
Expand All @@ -662,7 +662,7 @@ def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_oil_expr):
elif self.token_kind == Kind.Unknown:
tok = self.cur_token
# x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
if is_oil_expr or not self.parse_opts.parse_backslash():
if is_ysh_expr or not self.parse_opts.parse_backslash():
p_die("Invalid char escape in C-style string literal", tok)

tokens.append(tok)
Expand Down Expand Up @@ -715,17 +715,44 @@ def _ReadDoubleQuotedLeftParts(self):

def _ReadYshSingleQuoted(self, left_id):
# type: (Id_t) -> CompoundWord
"""
left_letter: u or b
"""Read YSH style strings
r'' u'' b''
r''' ''' u''' ''' b''' '''
"""
#log('BEF self.cur_token %s', self.cur_token)
lexer_mode = (lex_mode_e.SQ_Raw if left_id == Id.Left_RSingleQuote else
lex_mode_e.J8_Str)
if left_id == Id.Left_RSingleQuote:
lexer_mode = lex_mode_e.SQ_Raw
triple_left_id = Id.Left_RTSingleQuote
elif left_id == Id.Left_USingleQuote:
lexer_mode = lex_mode_e.J8_Str
triple_left_id = Id.Left_UTSingleQuote
elif left_id == Id.Left_BSingleQuote:
lexer_mode = lex_mode_e.J8_Str
triple_left_id = Id.Left_BTSingleQuote
else:
raise AssertionError(left_id)

sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

#log('AFT self.cur_token %s', self.cur_token)
if (len(sq_part.tokens) == 0 and self.lexer.ByteLookAhead() == "'"):
#log('yes')

self._SetNext(lex_mode_e.ShCommand)
self._GetToken()

assert self.token_type == Id.Left_SingleQuote
# HACK: magically transform the third ' in r''' / u''' / b''' to the
# matching triple-quote Id (triple_left_id), so that ''' is the terminator
left_dq_token = self.cur_token
left_dq_token.id = triple_left_id
#triple_out.b = True # let caller know we got it
sq_part = self._ReadSingleQuoted(left_dq_token, lexer_mode)

sq_part.left.id = left_id
# TODO: read ending quotes
else:
#log('AFT self.cur_token %s', self.cur_token)
sq_part.left.id = left_id

# Advance and validate
self._SetNext(lex_mode_e.ShCommand)
Expand All @@ -748,16 +775,18 @@ def _ReadUnquotedLeftParts(self, triple_out):
# Note: $"" is a synonym for "". It might make sense if it added
# \n \0 \x00 \u{123} etc. But that's not what bash does!
dq_part = self._ReadDoubleQuoted(self.cur_token)
if triple_out and len(dq_part.parts) == 0: # read empty word ""
if self.lexer.ByteLookAhead() == '"':
self._SetNext(lex_mode_e.ShCommand)
self._GetToken()
# HACK: magically transform the third " in """ to
# Id.Left_TDoubleQuote, so that """ is the terminator
left_dq_token = self.cur_token
left_dq_token.id = Id.Left_TDoubleQuote
triple_out.b = True # let caller know we got it
return self._ReadDoubleQuoted(left_dq_token)
# Got empty word "" and there's a " after
if (triple_out and len(dq_part.parts) == 0 and
self.lexer.ByteLookAhead() == '"'):

self._SetNext(lex_mode_e.ShCommand)
self._GetToken()
# HACK: magically transform the third " in """ to
# Id.Left_TDoubleQuote, so that """ is the terminator
left_dq_token = self.cur_token
left_dq_token.id = Id.Left_TDoubleQuote
triple_out.b = True # let caller know we got it
return self._ReadDoubleQuoted(left_dq_token)

return dq_part

Expand All @@ -772,19 +801,19 @@ def _ReadUnquotedLeftParts(self, triple_out):
new_id = Id.Left_TSingleQuote

sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
if triple_out and len(sq_part.tokens) == 0:
# Read empty '' or r'' or $''. Now test if there's a triple
# quote.
if self.lexer.ByteLookAhead() == "'":
self._SetNext(lex_mode_e.ShCommand)
self._GetToken()
# Got empty '' or r'' or $'' and there's a ' after
if (triple_out and len(sq_part.tokens) == 0 and
self.lexer.ByteLookAhead() == "'"):

self._SetNext(lex_mode_e.ShCommand)
self._GetToken()

# HACK: magically transform the third ' in r''' to
# Id.Left_RTSingleQuote, so that ''' is the terminator
left_sq_token = self.cur_token
left_sq_token.id = new_id
triple_out.b = True # let caller know we got it
return self._ReadSingleQuoted(left_sq_token, lexer_mode)
# HACK: magically transform the third ' in r''' to
# Id.Left_RTSingleQuote, so that ''' is the terminator
left_sq_token = self.cur_token
left_sq_token.id = new_id
triple_out.b = True # let caller know we got it
return self._ReadSingleQuoted(left_sq_token, lexer_mode)

return sq_part

Expand Down Expand Up @@ -853,13 +882,13 @@ def _ReadExtGlob(self):

return word_part.ExtGlob(left_token, arms, right_token)

def _ReadLikeDQ(self, left_token, is_oil_expr, out_parts):
def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
# type: (Optional[Token], bool, List[word_part_t]) -> None
"""
Args:
left_token: A token if we are reading a double quoted part, or None if
we're reading a here doc.
is_oil_expr: Whether to disallow backticks and invalid char escapes
is_ysh_expr: Whether to disallow backticks and invalid char escapes
out_parts: list of word_part to append to
"""
if left_token:
Expand All @@ -884,21 +913,21 @@ def _ReadLikeDQ(self, left_token, is_oil_expr, out_parts):
# YSH.
# Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
# recursion (unless parse_backslash)
if (is_oil_expr or
if (is_ysh_expr or
not self.parse_opts.parse_backslash()):
p_die(
"Invalid char escape in double quoted string",
self.cur_token)
elif self.token_type == Id.Lit_Dollar:
if is_oil_expr or not self.parse_opts.parse_dollar():
if is_ysh_expr or not self.parse_opts.parse_dollar():
p_die("Literal $ should be quoted like \$",
self.cur_token)

part = self.cur_token
out_parts.append(part)

elif self.token_kind == Kind.Left:
if self.token_type == Id.Left_Backtick and is_oil_expr:
if self.token_type == Id.Left_Backtick and is_ysh_expr:
p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
self.cur_token)

Expand Down Expand Up @@ -1924,7 +1953,8 @@ def _ReadWord(self, word_mode):
self._GetToken()
#assert self.token_type == Id.Left_SingleQuote, self.token_type

return self._ReadYshSingleQuoted(Id.Left_RSingleQuote)
return self._ReadYshSingleQuoted(
Id.Left_RSingleQuote)

# When shopt -s parse_j8_string
# echo u'\u{3bc}' b'\yff' works
Expand Down

0 comments on commit ab32747

Please sign in to comment.