Skip to content

Commit

Permalink
[frontend refactor] Remove Id.Char_Literals in favor of Id.Lit_Chars
Browse files Browse the repository at this point in the history
This will make whitespace stripping and the lossless variant more
uniform.
  • Loading branch information
Andy C committed Mar 17, 2024
1 parent 9458978 commit 77b9f7e
Show file tree
Hide file tree
Showing 10 changed files with 22 additions and 25 deletions.
9 changes: 4 additions & 5 deletions builtin/printf_osh.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time as time_ # avoid name conflict

from _devbuild.gen import arg_types
from _devbuild.gen.id_kind_asdl import Id, Kind, Id_t, Kind_t
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
from _devbuild.gen.runtime_asdl import cmd_value
from _devbuild.gen.syntax_asdl import (
loc,
Expand Down Expand Up @@ -131,9 +131,8 @@ def Parse(self):
self._Next(lex_mode_e.PrintfOuter)
parts = [] # type: List[printf_part_t]
while True:
if (self.token_kind == Kind.Char or
self.token_type == Id.Format_EscapedPercent or
self.token_type == Id.Unknown_Backslash):
if (self.token_kind in (Kind.Lit, Kind.Char) or self.token_type
in (Id.Format_EscapedPercent, Id.Unknown_Backslash)):

# Note: like in echo -e, we don't fail with Unknown_Backslash here
# when shopt -u parse_backslash because it's at runtime rather than
Expand All @@ -151,7 +150,7 @@ def Parse(self):
break

else:
raise AssertionError(self.token_type)
raise AssertionError(Id_str(self.token_type))

self._Next(lex_mode_e.PrintfOuter)

Expand Down
2 changes: 1 addition & 1 deletion builtin/read_osh.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def _MaybeDecodeLine(self, line):
lexer = self.parse_ctx.MakeLexer(line_reader)

# The parser only yields valid tokens:
# Char_Literals, Char_OneChar, Char_Hex, Char_UBraced
# Char_OneChar, Char_Hex, Char_UBraced
# So we can use word_compile.EvalCStringToken, which is also used for
# $''.
# Important: we don't generate Id.Unknown_Backslash because that is valid
Expand Down
2 changes: 1 addition & 1 deletion core/error.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def UserErrorString(self):
class Decode(Exception):
"""
List of J8 errors:
- message isn't UTF-8 - Id.Char_Literals - need loc
- message isn't UTF-8 - Id.Lit_Chars - need loc
- Invalid token Id.Unknown_Tok - need loc
- Unclosed double quote string -- need loc
- Parse error, e.g. [}{]
Expand Down
2 changes: 1 addition & 1 deletion data_lang/j8.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ def _DecodeString(self, left_id, str_pos):
# Now handle each kind of token
#

if tok_id == Id.Char_Literals: # JSON and J8
if tok_id == Id.Lit_Chars: # JSON and J8
part = self.s[str_pos:str_end]
if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
# Syntax error because JSON must be valid UTF-8
Expand Down
1 change: 0 additions & 1 deletion frontend/id_kind_def.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,6 @@ def AddKinds(spec):
'Unicode8', # bash
'UBraced',
'Pound', # YSH
'Literals',
'AsciiControl', # \x01-\x1f, what's disallowed in JSON
])

Expand Down
14 changes: 7 additions & 7 deletions frontend/lexer_def.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
After changing this file, run:
build/dev.sh all
build/py.sh all
or at least:
build/dev.sh fastlex
build/py.sh fastlex
Input Handling
--------------
Expand Down Expand Up @@ -497,7 +497,7 @@ def R(pat, tok_type):
C(r'\c', Id.Char_Stop),

# e.g. 'foo', anything that's not a backslash escape
R(r'[^\\\0]+', Id.Char_Literals),
R(r'[^\\\0]+', Id.Lit_Chars),
]

# https://json.org/
Expand Down Expand Up @@ -598,7 +598,7 @@ def R(pat, tok_type):
_ASCII_CONTROL,

# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
R(r'''[^\\'\0]+''', Id.Char_Literals),
R(r'''[^\\'\0]+''', Id.Lit_Chars),
]

# For "JSON strings \" \u1234"
Expand All @@ -616,7 +616,7 @@ def R(pat, tok_type):
_ASCII_CONTROL,

# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
R(r'[^\\"\0]+', Id.Char_Literals),
R(r'[^\\"\0]+', Id.Lit_Chars),
R(r'[^\0]', Id.Unknown_Tok),
]

Expand Down Expand Up @@ -654,13 +654,13 @@ def R(pat, tok_type):
C(r'\"', Id.Char_OneChar),

# e.g. 'foo', anything that's not a backslash escape or '
R(r"[^\\'\0]+", Id.Char_Literals),
R(r"[^\\'\0]+", Id.Lit_Chars),
C("'", Id.Right_SingleQuote),
]

LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
R(OCTAL3_RE, Id.Char_Octal3),
R(r"[^%\\\0]+", Id.Char_Literals),
R(r"[^%\\\0]+", Id.Lit_Chars),
C('%%', Id.Format_EscapedPercent),
C('%', Id.Format_Percent),
]
Expand Down
2 changes: 1 addition & 1 deletion frontend/lexer_def_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def testMode_DollarSq(self):

t = lexer.Read(lex_mode_e.SQ_C)
print(t)
self.assertTokensEqual(FakeTok(Id.Char_Literals, 'foo bar'), t)
self.assertTokensEqual(FakeTok(Id.Lit_Chars, 'foo bar'), t)

t = lexer.Read(lex_mode_e.SQ_C)
print(t)
Expand Down
11 changes: 5 additions & 6 deletions osh/word_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def EvalCStringToken(id_, value):
$'' could use it at compile time, much like brace expansion in braces.py.
"""
if id_ in (Id.Char_Literals, Id.Unknown_Backslash, Id.Char_AsciiControl):
if id_ in (Id.Lit_Chars, Id.Unknown_Backslash, Id.Char_AsciiControl):
# shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.

# Char_AsciiControl is allowed in YSH code, for newlines in u''
Expand Down Expand Up @@ -246,8 +246,7 @@ def RemoveLeadingSpaceSQ(tokens):
arena.
Quirk to make more consistent:
In $''', we have Char_Literals \n
In r''' and ''', we have Lit_Chars \n
In $''' and r''' and ''', we have Lit_Chars \n
In u''' and b''', we have Char_AsciiControl \n
"""
if 0:
Expand All @@ -263,14 +262,14 @@ def RemoveLeadingSpaceSQ(tokens):
# x
# '''
first = tokens[0]
if first.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
if first.id in (Id.Lit_Chars, Id.Char_AsciiControl):
if _IsTrailingSpace(first):
tokens.pop(0) # Remove the first part

# Figure out what to strip, based on last token
last = tokens[-1]
to_strip = None # type: Optional[str]
if last.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
if last.id in (Id.Lit_Chars, Id.Char_AsciiControl):
if _IsLeadingSpace(last):
to_strip = lexer.TokenVal(last)
tokens.pop() # Remove the last part
Expand All @@ -288,7 +287,7 @@ def RemoveLeadingSpaceSQ(tokens):
if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
tok.col = n
tok.length -= n
# TODO:
# Lit_Chars -> Lit_CharsWithoutPrefix
# Char_Literals -> Char_LitStripped
#
#log('STRIP tok %s', tok)
2 changes: 1 addition & 1 deletion osh/word_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
self._SetNext(lex_mode)
self._GetToken()

# Kind.Char emitted in DOLLAR_SQ state
# Kind.Char emitted in lex_mode.SQ_C
if self.token_kind in (Kind.Lit, Kind.Char):
tok = self.cur_token
# Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
Expand Down
2 changes: 1 addition & 1 deletion test/lossless.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ _compare() {
}

test-sh() {
for file in test/lossless/*; do
for file in test/lossless/*.sh; do
echo "--- $file"
_compare $file
done
Expand Down

0 comments on commit 77b9f7e

Please sign in to comment.