Skip to content

Commit

Permalink
[frontend refactor] Remove Id.Char_Literals in favor of Id.Lit_Chars
Browse files Browse the repository at this point in the history
This will make whitespace stripping and the lossless variant more
uniform.
  • Loading branch information
Andy C committed Mar 17, 2024
1 parent 9458978 commit 77b9f7e
Show file tree
Hide file tree
Showing 10 changed files with 22 additions and 25 deletions.
9 changes: 4 additions & 5 deletions builtin/printf_osh.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time as time_ # avoid name conflict

from _devbuild.gen import arg_types
from _devbuild.gen.id_kind_asdl import Id, Kind, Id_t, Kind_t
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
from _devbuild.gen.runtime_asdl import cmd_value
from _devbuild.gen.syntax_asdl import (
loc,
Expand Down Expand Up @@ -131,9 +131,8 @@ def Parse(self):
self._Next(lex_mode_e.PrintfOuter)
parts = [] # type: List[printf_part_t]
while True:
if (self.token_kind == Kind.Char or
self.token_type == Id.Format_EscapedPercent or
self.token_type == Id.Unknown_Backslash):
if (self.token_kind in (Kind.Lit, Kind.Char) or self.token_type
in (Id.Format_EscapedPercent, Id.Unknown_Backslash)):

# Note: like in echo -e, we don't fail with Unknown_Backslash here
# when shopt -u parse_backslash because it's at runtime rather than
Expand All @@ -151,7 +150,7 @@ def Parse(self):
break

else:
raise AssertionError(self.token_type)
raise AssertionError(Id_str(self.token_type))

self._Next(lex_mode_e.PrintfOuter)

Expand Down
2 changes: 1 addition & 1 deletion builtin/read_osh.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def _MaybeDecodeLine(self, line):
lexer = self.parse_ctx.MakeLexer(line_reader)

# The parser only yields valid tokens:
# Char_Literals, Char_OneChar, Char_Hex, Char_UBraced
# Char_OneChar, Char_Hex, Char_UBraced
# So we can use word_compile.EvalCStringToken, which is also used for
# $''.
# Important: we don't generate Id.Unknown_Backslash because that is valid
Expand Down
2 changes: 1 addition & 1 deletion core/error.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def UserErrorString(self):
class Decode(Exception):
"""
List of J8 errors:
- message isn't UTF-8 - Id.Char_Literals - need loc
- message isn't UTF-8 - Id.Lit_Chars - need loc
- Invalid token Id.Unknown_Tok - need loc
- Unclosed double quote string -- need loc
- Parse error, e.g. [}{]
Expand Down
2 changes: 1 addition & 1 deletion data_lang/j8.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ def _DecodeString(self, left_id, str_pos):
# Now handle each kind of token
#

if tok_id == Id.Char_Literals: # JSON and J8
if tok_id == Id.Lit_Chars: # JSON and J8
part = self.s[str_pos:str_end]
if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
# Syntax error because JSON must be valid UTF-8
Expand Down
1 change: 0 additions & 1 deletion frontend/id_kind_def.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,6 @@ def AddKinds(spec):
'Unicode8', # bash
'UBraced',
'Pound', # YSH
'Literals',
'AsciiControl', # \x01-\x1f, what's disallowed in JSON
])

Expand Down
14 changes: 7 additions & 7 deletions frontend/lexer_def.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
After changing this file, run:
build/dev.sh all
build/py.sh all
or at least:
build/dev.sh fastlex
build/py.sh fastlex
Input Handling
--------------
Expand Down Expand Up @@ -497,7 +497,7 @@ def R(pat, tok_type):
C(r'\c', Id.Char_Stop),

# e.g. 'foo', anything that's not a backslash escape
R(r'[^\\\0]+', Id.Char_Literals),
R(r'[^\\\0]+', Id.Lit_Chars),
]

# https://json.org/
Expand Down Expand Up @@ -598,7 +598,7 @@ def R(pat, tok_type):
_ASCII_CONTROL,

# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
R(r'''[^\\'\0]+''', Id.Char_Literals),
R(r'''[^\\'\0]+''', Id.Lit_Chars),
]

# For "JSON strings \" \u1234"
Expand All @@ -616,7 +616,7 @@ def R(pat, tok_type):
_ASCII_CONTROL,

# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
R(r'[^\\"\0]+', Id.Char_Literals),
R(r'[^\\"\0]+', Id.Lit_Chars),
R(r'[^\0]', Id.Unknown_Tok),
]

Expand Down Expand Up @@ -654,13 +654,13 @@ def R(pat, tok_type):
C(r'\"', Id.Char_OneChar),

# e.g. 'foo', anything that's not a backslash escape or '
R(r"[^\\'\0]+", Id.Char_Literals),
R(r"[^\\'\0]+", Id.Lit_Chars),
C("'", Id.Right_SingleQuote),
]

LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
R(OCTAL3_RE, Id.Char_Octal3),
R(r"[^%\\\0]+", Id.Char_Literals),
R(r"[^%\\\0]+", Id.Lit_Chars),
C('%%', Id.Format_EscapedPercent),
C('%', Id.Format_Percent),
]
Expand Down
2 changes: 1 addition & 1 deletion frontend/lexer_def_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def testMode_DollarSq(self):

t = lexer.Read(lex_mode_e.SQ_C)
print(t)
self.assertTokensEqual(FakeTok(Id.Char_Literals, 'foo bar'), t)
self.assertTokensEqual(FakeTok(Id.Lit_Chars, 'foo bar'), t)

t = lexer.Read(lex_mode_e.SQ_C)
print(t)
Expand Down
11 changes: 5 additions & 6 deletions osh/word_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def EvalCStringToken(id_, value):
$'' could use it at compile time, much like brace expansion in braces.py.
"""
if id_ in (Id.Char_Literals, Id.Unknown_Backslash, Id.Char_AsciiControl):
if id_ in (Id.Lit_Chars, Id.Unknown_Backslash, Id.Char_AsciiControl):
# shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.

# Char_AsciiControl is allowed in YSH code, for newlines in u''
Expand Down Expand Up @@ -246,8 +246,7 @@ def RemoveLeadingSpaceSQ(tokens):
arena.
Quirk to make more consistent:
In $''', we have Char_Literals \n
In r''' and ''', we have Lit_Chars \n
In $''' and r''' and ''', we have Lit_Chars \n
In u''' and b''', we have Char_AsciiControl \n
"""
if 0:
Expand All @@ -263,14 +262,14 @@ def RemoveLeadingSpaceSQ(tokens):
# x
# '''
first = tokens[0]
if first.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
if first.id in (Id.Lit_Chars, Id.Char_AsciiControl):
if _IsTrailingSpace(first):
tokens.pop(0) # Remove the first part

# Figure out what to strip, based on last token
last = tokens[-1]
to_strip = None # type: Optional[str]
if last.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
if last.id in (Id.Lit_Chars, Id.Char_AsciiControl):
if _IsLeadingSpace(last):
to_strip = lexer.TokenVal(last)
tokens.pop() # Remove the last part
Expand All @@ -288,7 +287,7 @@ def RemoveLeadingSpaceSQ(tokens):
if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
tok.col = n
tok.length -= n
# TODO:
# Lit_Chars -> Lit_CharsWithoutPrefix
# Char_Literals -> Char_LitStripped
#
#log('STRIP tok %s', tok)
2 changes: 1 addition & 1 deletion osh/word_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
self._SetNext(lex_mode)
self._GetToken()

# Kind.Char emitted in DOLLAR_SQ state
# Kind.Char emitted in lex_mode.SQ_C
if self.token_kind in (Kind.Lit, Kind.Char):
tok = self.cur_token
# Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
Expand Down
2 changes: 1 addition & 1 deletion test/lossless.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ _compare() {
}

test-sh() {
for file in test/lossless/*; do
for file in test/lossless/*.sh; do
echo "--- $file"
_compare $file
done
Expand Down

0 comments on commit 77b9f7e

Please sign in to comment.