[word_parse refactor] Prepare for J8 multi-line strings

u''', b''', and r''' should all be handled by the same function.
oilshell · Jan 2, 2024 · ab32747 · ab32747
1 parent 590cb4e
commit ab32747
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 42 deletions.
diff --git a/frontend/id_kind_def.py b/frontend/id_kind_def.py
@@ -434,7 +434,6 @@ def AddKinds(spec):
             'DoubleQuote',
             'SingleQuote',  # ''
             'DollarSingleQuote',  # $'' for \n escapes
-
             'RSingleQuote',  # r''
             'USingleQuote',  # u''
             'BSingleQuote',  # b''
@@ -445,7 +444,6 @@ def AddKinds(spec):
             'RTSingleQuote',  # r''' '''
             'UTSingleQuote',  # u''' '''
             'BTSingleQuote',  # b''' '''
-
             'DollarTSingleQuote',  # $''' '''
             'Backtick',  # `
             'DollarParen',  # $(

diff --git a/frontend/lexer_def.py b/frontend/lexer_def.py
@@ -806,15 +806,13 @@ def R(pat, tok_type):
     C("u'", Id.Left_USingleQuote),
     C("b'", Id.Left_BSingleQuote),
     C("$'", Id.Left_DollarSingleQuote),  # TODO: remove
-
     C('"""', Id.Left_TDoubleQuote),
     # In expression mode, we add the r'' and c'' prefixes for '' and $''.
     C("'''", Id.Left_TSingleQuote),
     C("r'''", Id.Left_RTSingleQuote),
     C("u'''", Id.Left_UTSingleQuote),
     C("b'''", Id.Left_BTSingleQuote),
     C("$'''", Id.Left_DollarTSingleQuote),  # TODO: remove from YSH expressions
-
     C('@(', Id.Left_AtParen),  # Split Command Sub
     C('^(', Id.Left_CaretParen),  # Block literals in expression mode
     C('^[', Id.Left_CaretBracket),  # Expr literals

diff --git a/osh/word_parse.py b/osh/word_parse.py
@@ -619,12 +619,12 @@ def _ReadSingleQuoted(self, left_token, lex_mode):
         node = SingleQuoted(left_token, tokens, right_quote)
         return node
 
-    def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_oil_expr):
+    def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_ysh_expr):
         # type: (lex_mode_t, Token, List[Token], bool) -> Token
         """Used by expr_parse.py."""
 
         # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
-        no_backslashes = is_oil_expr and left_token.id == Id.Left_SingleQuote
+        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
 
         expected_end_tokens = 3 if left_token.id in (
             Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
@@ -645,7 +645,7 @@ def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_oil_expr):
                         r"Strings with backslashes should look like r'\n' or $'\n'",
                         tok)
 
-                if is_oil_expr:
+                if is_ysh_expr:
                     if self.token_type == Id.Char_Octal3:
                         p_die(
                             r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
@@ -662,7 +662,7 @@ def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_oil_expr):
             elif self.token_kind == Kind.Unknown:
                 tok = self.cur_token
                 # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
-                if is_oil_expr or not self.parse_opts.parse_backslash():
+                if is_ysh_expr or not self.parse_opts.parse_backslash():
                     p_die("Invalid char escape in C-style string literal", tok)
 
                 tokens.append(tok)
@@ -715,17 +715,44 @@ def _ReadDoubleQuotedLeftParts(self):
 
     def _ReadYshSingleQuoted(self, left_id):
         # type: (Id_t) -> CompoundWord
-        """
-        left_letter: u or b
+        """Read YSH style strings
+
+        r''        u''        b''
+        r''' '''   u''' '''   b''' '''
         """
         #log('BEF self.cur_token %s', self.cur_token)
-        lexer_mode = (lex_mode_e.SQ_Raw if left_id == Id.Left_RSingleQuote else
-                      lex_mode_e.J8_Str)
+        if left_id == Id.Left_RSingleQuote:
+            lexer_mode = lex_mode_e.SQ_Raw 
+            triple_left_id = Id.Left_RTSingleQuote
+        elif left_id == Id.Left_USingleQuote:
+            lexer_mode = lex_mode_e.J8_Str
+            triple_left_id = Id.Left_UTSingleQuote
+        elif left_id == Id.Left_BSingleQuote:
+            lexer_mode = lex_mode_e.J8_Str
+            triple_left_id = Id.Left_BTSingleQuote
+        else:
+            raise AssertionError(left_id)
+
         sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
 
-        #log('AFT self.cur_token %s', self.cur_token)
+        if (len(sq_part.tokens) == 0 and self.lexer.ByteLookAhead() == "'"):
+            #log('yes')
+
+            self._SetNext(lex_mode_e.ShCommand)
+            self._GetToken()
+
+            assert self.token_type == Id.Left_SingleQuote
+            # HACK: magically transform the third ' in u''' to
+            # Id.Left_UTSingleQuote, so that ''' is the terminator
+            left_dq_token = self.cur_token
+            left_dq_token.id = triple_left_id
+            #triple_out.b = True  # let caller know we got it
+            sq_part = self._ReadSingleQuoted(left_dq_token, lexer_mode)
 
-        sq_part.left.id = left_id
+            # TODO: read ending quotes
+        else:
+            #log('AFT self.cur_token %s', self.cur_token)
+            sq_part.left.id = left_id
 
         # Advance and validate
         self._SetNext(lex_mode_e.ShCommand)
@@ -748,16 +775,18 @@ def _ReadUnquotedLeftParts(self, triple_out):
             # Note: $"" is a synonym for "".  It might make sense if it added
             # \n \0 \x00 \u{123} etc.  But that's not what bash does!
             dq_part = self._ReadDoubleQuoted(self.cur_token)
-            if triple_out and len(dq_part.parts) == 0:  # read empty word ""
-                if self.lexer.ByteLookAhead() == '"':
-                    self._SetNext(lex_mode_e.ShCommand)
-                    self._GetToken()
-                    # HACK: magically transform the third " in """ to
-                    # Id.Left_TDoubleQuote, so that """ is the terminator
-                    left_dq_token = self.cur_token
-                    left_dq_token.id = Id.Left_TDoubleQuote
-                    triple_out.b = True  # let caller know we got it
-                    return self._ReadDoubleQuoted(left_dq_token)
+            # Got empty word "" and there's a " after
+            if (triple_out and len(dq_part.parts) == 0 and
+                    self.lexer.ByteLookAhead() == '"'):
+
+                self._SetNext(lex_mode_e.ShCommand)
+                self._GetToken()
+                # HACK: magically transform the third " in """ to
+                # Id.Left_TDoubleQuote, so that """ is the terminator
+                left_dq_token = self.cur_token
+                left_dq_token.id = Id.Left_TDoubleQuote
+                triple_out.b = True  # let caller know we got it
+                return self._ReadDoubleQuoted(left_dq_token)
 
             return dq_part
 
@@ -772,19 +801,19 @@ def _ReadUnquotedLeftParts(self, triple_out):
                 new_id = Id.Left_TSingleQuote
 
             sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
-            if triple_out and len(sq_part.tokens) == 0:
-                # Read empty '' or r'' or $''.  Now test if there's a triple
-                # quote.
-                if self.lexer.ByteLookAhead() == "'":
-                    self._SetNext(lex_mode_e.ShCommand)
-                    self._GetToken()
+            # Got empty '' or r'' or $'' and there's a ' after
+            if (triple_out and len(sq_part.tokens) == 0 and
+                    self.lexer.ByteLookAhead() == "'"):
+
+                self._SetNext(lex_mode_e.ShCommand)
+                self._GetToken()
 
-                    # HACK: magically transform the third ' in r''' to
-                    # Id.Left_RTSingleQuote, so that ''' is the terminator
-                    left_sq_token = self.cur_token
-                    left_sq_token.id = new_id
-                    triple_out.b = True  # let caller know we got it
-                    return self._ReadSingleQuoted(left_sq_token, lexer_mode)
+                # HACK: magically transform the third ' in r''' to
+                # Id.Left_RTSingleQuote, so that ''' is the terminator
+                left_sq_token = self.cur_token
+                left_sq_token.id = new_id
+                triple_out.b = True  # let caller know we got it
+                return self._ReadSingleQuoted(left_sq_token, lexer_mode)
 
             return sq_part
 
@@ -853,13 +882,13 @@ def _ReadExtGlob(self):
 
         return word_part.ExtGlob(left_token, arms, right_token)
 
-    def _ReadLikeDQ(self, left_token, is_oil_expr, out_parts):
+    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
         # type: (Optional[Token], bool, List[word_part_t]) -> None
         """
         Args:
           left_token: A token if we are reading a double quoted part, or None if
             we're reading a here doc.
-          is_oil_expr: Whether to disallow backticks and invalid char escapes
+          is_ysh_expr: Whether to disallow backticks and invalid char escapes
           out_parts: list of word_part to append to
         """
         if left_token:
@@ -884,21 +913,21 @@ def _ReadLikeDQ(self, left_token, is_oil_expr, out_parts):
                         # YSH.
                         # Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
                         # recursion (unless parse_backslash)
-                        if (is_oil_expr or
+                        if (is_ysh_expr or
                                 not self.parse_opts.parse_backslash()):
                             p_die(
                                 "Invalid char escape in double quoted string",
                                 self.cur_token)
                     elif self.token_type == Id.Lit_Dollar:
-                        if is_oil_expr or not self.parse_opts.parse_dollar():
+                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                             p_die("Literal $ should be quoted like \$",
                                   self.cur_token)
 
                     part = self.cur_token
                 out_parts.append(part)
 
             elif self.token_kind == Kind.Left:
-                if self.token_type == Id.Left_Backtick and is_oil_expr:
+                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                     p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                           self.cur_token)
 
@@ -1924,7 +1953,8 @@ def _ReadWord(self, word_mode):
                             self._GetToken()
                             #assert self.token_type == Id.Left_SingleQuote, self.token_type
 
-                            return self._ReadYshSingleQuoted(Id.Left_RSingleQuote)
+                            return self._ReadYshSingleQuoted(
+                                Id.Left_RSingleQuote)
 
                     # When shopt -s parse_j8_string
                     #     echo u'\u{3bc}' b'\yff' works