Skip to content

Commit

Permalink
[ysh breaking] @() decodes J8 lines, instead of doing IFS splitting
Browse files Browse the repository at this point in the history
This is YSH style!

Invariant: any argv array can be represented.  This is not true with
IFS!  You would have to change this mutable global variable to represent
any string, which leads to data-dependent bugs.
  • Loading branch information
Andy Chu committed Apr 19, 2024
1 parent 0144322 commit 20d2da4
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 89 deletions.
156 changes: 98 additions & 58 deletions data_lang/j8.py
Expand Up @@ -583,7 +583,7 @@ def Next(self):
self.pos = end_pos
return tok_id, end_pos, None

def J8LinesNext(self):
def NextForLines(self):
# type: () -> Tuple[Id_t, int, Optional[str]]
""" Like Next(), but for J8 Lines """

Expand Down Expand Up @@ -734,14 +734,19 @@ def _Next(self):
def _Eat(self, tok_id):
# type: (Id_t) -> None

# TODO: Need location info
if self.tok_id != tok_id:
#log('position %r %d-%d %r', self.s, self.start_pos,
# self.end_pos, self.s[self.start_pos:self.end_pos])
raise self._Error("Expected %s, got %s" %
(Id_str(tok_id), Id_str(self.tok_id)))
self._Next()

def _NextForLines(self):
# type: () -> None
"""Like _Next, but use the J8 Lines lexer."""
self.start_pos = self.end_pos
self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

def _Error(self, msg):
# type: (str) -> error.Decode
return error.Decode(msg, self.s, self.start_pos, self.end_pos)
Expand Down Expand Up @@ -1063,100 +1068,135 @@ def ParseNil8(self):


class J8LinesParser(_Parser):
"""
Line-based grammar:
"""Decode lines from a string with newlines.
We specify this with a grammar, to preserve location info and to reduce
allocations. (But note that unquoted_line is more like a LOOP than it is
grammatical.)
Grammar:
end = Op_Newline | Eol_Tok
empty_line = WS_Space? end
j8_line = Space? string Space? (Op_Newline | Eof_Real)
string = J8_String
| (Lit_Chars (Space (Lit_Chars | Left-*))*)
# special case: read until end token, but REMOVE trailing WS_Space
unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end
j8_line = WS_Space? J8_String WS_Space? end
lines = (empty_line | unquoted_line | j8_line)*
where Lit_Chars is valid UTF-8
Note that "" and u'' here are not a decoded string, because the line
started with literals:
Notes:
(1) We disallow multiple strings on a line, like:
"json" "json2"
"json" unquoted
(2) Internal quotes are allowed on unquoted lines. Consider this line:
foo "" u''
Using the grammar style preserves location info, reduces allocations.
The "" and u'' are not a decoded string, because the line started with
Id.Lit_Chars literals.
- TODO: We need something like LexerDecoder, but with a J8_LINES mode
- _Parser should take the outer lexer.
(3) This is related to TSV8? Similar rules. Does TSV8 have empty cells?
Does it have - for empty cell?
"""

def __init__(self, s):
# type: (str) -> None
_Parser.__init__(self, s, True)

def ParseLine(self):
# type: () -> Optional[str]
pass
def _Show(self, s):
# type: (str) -> None
log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
self.end_pos)

def _Parse(self):
# type: () -> List[str]
def _ParseLine(self, out):
# type: (List[str]) -> None
""" May append a line to 'out' """
#self._Show('1')
if self.tok_id == Id.WS_Space:
self._NextForLines()

log('tok_id %s', Id_str(self.tok_id))
# Empty line - return without doing anything
if self.tok_id in (Id.Op_Newline, Id.Eol_Tok):
self._NextForLines()
return

if self.tok_id == Id.WS_Space:
self._Next()
# Quoted string on line
if self.tok_id == Id.J8_String:
out.append(self.decoded)
self._NextForLines()

if self.tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
Id.Left_USingleQuote):
#tok_id, end_pos, s = self._DecodeString(tok_id, end_pos)
#log('s %r', s)
pass
if self.tok_id == Id.WS_Space: # trailing whitespace
self._NextForLines()

if self.tok_id == Id.WS_Space:
self._Next()
pass
return None
if self.tok_id not in (Id.Op_Newline, Id.Eol_Tok):
raise self._Error('Unexpected text after J8 Line (%s)' %
Id_str(self.tok_id))

def _Show(self, s):
# type: (str) -> None
log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
self.end_pos)
self._NextForLines()
return

# Unquoted line
if self.tok_id == Id.Lit_Chars:
# ' unquoted "" text on line ' # read every token until end
string_start = self.start_pos
while True:
# for stripping whitespace
prev_id = self.tok_id
prev_start = self.start_pos

self._NextForLines()
if self.tok_id in (Id.Op_Newline, Id.Eol_Tok):
break

if prev_id == Id.WS_Space:
string_end = prev_start # remove trailing whitespace
else:
string_end = self.start_pos

out.append(self.s[string_start:string_end])

self._NextForLines() # past newline
return

raise AssertionError(Id_str(self.tok_id))

def Parse(self):
# type: () -> List[str]
""" Raises error.Decode. """
self._NextForLines()

self._Show('1')

# TODO: J8LinesNext()
self._Next()
self._Show('2')
lines = self._Parse()
lines = [] # type: List[str]
while self.tok_id != Id.Eol_Tok:
self._ParseLine(lines)

if self.tok_id != Id.Eol_Tok:
raise self._Error('Unexpected trailing input')
raise self._Error('Unexpected trailing input in J8 Lines')

return lines


def SplitJ8Lines(s):
# type: (str) -> List[str]
"""Used by @(echo split command sub)
3 Errors
Raises:
error.Decode
- quotes don't match on a line
- J8 syntax error inside quotes
3 Errors:
- J8 string syntax error inside quotes
- Extra input on line
- unquoted line isn't utf-8
Note that blank lines are IGNORED
Notes:
- This is related to TSV8? Similar rules. Does TSV8 have empty cells?
- It might have the - alias for empty cell?
Can we write this like a lexer, to preserve positions?
We just need a significant newline character, and then maybe an "outer"
lexer mode with \n and whitespace?
Or we can also just reuse the same LexerDecoder for each line? Reset()?
Then we get the line number for free.
"""
p = J8LinesParser(s)
# raises error.Decode
return p.Parse()


# vim: sw=4
2 changes: 1 addition & 1 deletion frontend/lexer_def.py
Expand Up @@ -602,7 +602,7 @@ def R(pat, tok_type):
R(r'[\n]', Id.Op_Newline),

# not whitespace (space, tab, CR, LF) or ' or " or \0 (EOF)
R(r'''[^ \r\t'"\0]+''', Id.Lit_Chars),
R(r'''[^ \t\r\n'"\0]+''', Id.Lit_Chars),
]

# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data
Expand Down
11 changes: 8 additions & 3 deletions osh/word_eval.py
Expand Up @@ -54,7 +54,7 @@
from core import state
from core import ui
from core import util
#from data_lang import j8
from data_lang import j8
from data_lang import j8_lite
from core.error import e_die
from frontend import consts
Expand Down Expand Up @@ -2304,8 +2304,13 @@ def _EvalCommandSub(self, cs_part, quoted):

if cs_part.left_token.id == Id.Left_AtParen:
# YSH splitting algorithm: does not depend on IFS
#strs = j8.SplitJ8Lines(stdout_str)
strs = self.splitter.SplitForWordEval(stdout_str)
try:
strs = j8.SplitJ8Lines(stdout_str)
except error.Decode as e:
# status code 4 is special, for encode/decode errors.
raise error.Structured(4, e.Message(), cs_part.left_token)

#strs = self.splitter.SplitForWordEval(stdout_str)
return part_value.Array(strs)
else:
return Piece(stdout_str, quoted, not quoted)
Expand Down
39 changes: 23 additions & 16 deletions spec/ysh-command-sub.test.sh
@@ -1,4 +1,4 @@
## oils_failures_allowed: 1
## oils_failures_allowed: 0

#### Conflict with extglob @( can be avoided with ,(

Expand Down Expand Up @@ -39,13 +39,13 @@ x=axbxxc
argv.py $x

# This construct behaves the same with simple_word_eval on
# Not affected by IFS
echo --
shopt -s simple_word_eval
write -- @(seq 3)


echo --
write -- @(echo axbxxc)
argv.py @(echo $x)

## STDOUT:
1
Expand All @@ -58,38 +58,45 @@ write -- @(echo axbxxc)
2
3
--
a
b

c
['axbxxc']
## END

#### @() decodes J8 Lines

# syntax errors - TODO: document this in doc/chap-errors

# - quotes that don't match
# - syntax error inside quotes
# - syntax error in J8 string (bad escape, no closing quote)
# - extra text after line
# - unquoted line isn't valid UTF-8

var b = @(
# spaces stripped here
echo " unquoted ";
# unquoted: spaces stripped at beginning and end
echo ' unquoted "" word '

# empty line is ignored
echo

# Not allowed, since unquoted lines should be UTF-8
#echo $'binary \xff';

echo '"json\n\u03bc"';
echo "u'j8 u \\u{3bc}'";
echo "b'j8 b \\yff'";
echo ' " json \u03bc " '
echo " u' j8 u \\u{3bc} ' "
echo " b' j8 b \\u{3bc} ' "

# no quotes is same as u''
echo "'j8 u \\u{3bc}'";
# no prefix is like u''
echo " ' j8 no prefix \\u{3bc} ' "
)

json write (b)

## STDOUT:
[
"unquoted \"\" word",
" json μ ",
" j8 u μ ",
" j8 b μ ",
" j8 no prefix μ "
]
## END

#### for loop using @(seq $n)
Expand Down
31 changes: 23 additions & 8 deletions test/ysh-runtime-errors.sh
Expand Up @@ -917,23 +917,38 @@ pp line (d)
}

test-command-sub-j8() {
return
_ysh-should-run-here <<'EOF'
write @(echo ' "json\tstring" '; echo; echo " b'j8' "; echo ' unquoted ';)
EOF
#return

# quotes that don't match
_ysh-error-here-X 2 <<'EOF'
_ysh-error-here-X 4 <<'EOF'
var lines = @(
echo "\"unbalanced"
echo '"unbalanced'
)
pp line (lines)
EOF

# error in word language
_ysh-error-here-X 4 <<'EOF'
write @(echo '"unbalanced')
EOF

# can't have two strings on a line
_ysh-error-here-X 4 <<'EOF'
write @(echo '"json" "nope"')
EOF

_ysh-error-here-X 4 <<'EOF'
write @(echo '"json" unquoted')
EOF

# syntax error inside quotes
_ysh-error-here-X 2 <<'EOF'
var lines = @(
echo '"\"'
)
pp line (lines)
_ysh-error-here-X 4 <<'EOF'
write @(echo '"hello \z"')
EOF
return

# unquoted line isn't valid UTF-8
_ysh-error-here-X 2 <<'EOF'
Expand Down

0 comments on commit 20d2da4

Please sign in to comment.