Skip to content

Commit

Permalink
[ysh] Preparing for @() to decode J8 lines
Browse files Browse the repository at this point in the history
Add some tests - not hooked up yet.

Fix J8 lexer error message.
  • Loading branch information
Andy Chu committed Apr 17, 2024
1 parent 9152b7e commit 04d9509
Show file tree
Hide file tree
Showing 8 changed files with 127 additions and 20 deletions.
66 changes: 62 additions & 4 deletions data_lang/j8.py
Expand Up @@ -543,11 +543,11 @@ class LexerDecoder(object):
string
"""

def __init__(self, s, is_j8):
# type: (str, bool) -> None
def __init__(self, s, is_j8, lang_str):
# type: (str, bool, str) -> None
self.s = s
self.is_j8 = is_j8
self.lang_str = "NIL8"
self.lang_str = lang_str

self.pos = 0
# Reuse this instance to save GC objects. JSON objects could have
Expand Down Expand Up @@ -698,7 +698,7 @@ def __init__(self, s, is_j8):
self.is_j8 = is_j8
self.lang_str = "J8" if is_j8 else "JSON"

self.lexer = LexerDecoder(s, is_j8)
self.lexer = LexerDecoder(s, is_j8, self.lang_str)
self.tok_id = Id.Undefined_Tok
self.start_pos = 0
self.end_pos = 0
Expand Down Expand Up @@ -1050,4 +1050,62 @@ def ParseNil8(self):
return obj


# types of lines
UNQUOTED = 0
SINGLE = 1
DOUBLE = 2


def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.  Used by @(echo split command sub)

    The input is split on newlines.  Leading and trailing whitespace is
    stripped from every line, and each remaining line is classified by the
    quotes at its two ends:

    - unquoted
    - single quoted: 'foo', u'foo' or b'foo'
    - double quoted: "foo"

    3 Errors
    - quotes don't match on a line
    - J8 syntax error inside quotes
    - unquoted line isn't utf-8

    Only the first error is detected so far.  Decoding the contents of quoted
    lines and validating UTF-8 are still TODO, so every accepted line is
    returned exactly as it appears after stripping.

    Note that blank lines are IGNORED

    Notes:
    - This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
      - It might have the - alias for empty cell?

    Args:
      s: the text to split, e.g. the stdout of a command sub

    Returns:
      The non-blank lines, stripped, in their original order.

    Raises:
      error.Decode: if the quote style at the start of a line doesn't match
        the one at its end, or a line consists of an opening quote with no
        closing quote.
    """
    lines = s.split('\n')
    strs = []  # type: List[str]
    for line in lines:
        # strip leading and trailing whitespace, no matter what
        line = line.strip()

        if len(line) == 0:
            # always skip blank lines - empty strings should be "" or ''
            continue

        # min_len is the shortest well-formed line for the detected quote
        # style.  Without it, a line that is just ' or " or u' or b' would
        # look balanced, because its opening quote is also its last char.
        left_type = UNQUOTED
        min_len = 0
        if line.startswith("u'") or line.startswith("b'"):
            left_type = SINGLE
            min_len = 3  # prefix + opening quote + closing quote
        elif line.startswith("'"):
            left_type = SINGLE
            min_len = 2
        elif line.startswith('"'):
            left_type = DOUBLE
            min_len = 2

        right_type = UNQUOTED
        if line.endswith("'"):
            right_type = SINGLE
        elif line.endswith('"'):
            right_type = DOUBLE

        if left_type != right_type or len(line) < min_len:
            # TODO: position
            raise error.Decode("Mismatched quotes in J8 Lines", line, 0, 0)

        if left_type == UNQUOTED:
            # TODO: validate UTF-8
            out_str = line
        else:
            # TODO: decode J8.  For now the quoted line is passed through
            # with its quotes intact.
            out_str = line
        strs.append(out_str)

    return strs


# vim: sw=4
18 changes: 9 additions & 9 deletions data_lang/j8_test.py
Expand Up @@ -149,37 +149,37 @@ def testJ8(self):
print(obj)

def testLexerDecoder(self):
lex = j8.LexerDecoder(r'{"hi": "bye \n"}', True)
lex = j8.LexerDecoder(r'{"hi": "bye \n"}', True, 'J8')
_PrintTokens(lex)

lex = j8.LexerDecoder(r"{u'unicode': b'bytes \y1f \yff'}", True)
lex = j8.LexerDecoder(r"{u'unicode': b'bytes \y1f \yff'}", True, 'J8')
_PrintTokens(lex)

lex = j8.LexerDecoder(
r'{"mu \u03BC \u0001":' + r"b'mu \u{03bc} \u{2620}'", True)
r'{"mu \u03BC \u0001":' + r"b'mu \u{03bc} \u{2620}'", True, 'J8')
_PrintTokens(lex)

lex = j8.LexerDecoder(r'{"x": [1, 2, 3.14, true]}', True)
lex = j8.LexerDecoder(r'{"x": [1, 2, 3.14, true]}', True, 'J8')
_PrintTokens(lex)

lex = j8.LexerDecoder(
r'''
[
1e9, 1e-9, -1e9, -1E-9, 42
]
''', True)
''', True, 'J8')
_PrintTokens(lex)

try:
lex = j8.LexerDecoder('"\x01"', True)
lex = j8.LexerDecoder('"\x01"', True, 'J8')
_PrintTokens(lex)
except error.Decode as e:
print(e)
else:
self.fail('Expected failure')

try:
lex = j8.LexerDecoder('"\x1f"', True)
lex = j8.LexerDecoder('"\x1f"', True, 'J8')
_PrintTokens(lex)
except error.Decode as e:
print(e)
Expand All @@ -197,11 +197,11 @@ def testMoreTokens(self):
'(Node left:(-> 123))',
]
for s in cases:
lex = j8.LexerDecoder(s, True)
lex = j8.LexerDecoder(s, True, 'J8')
_PrintTokens(lex)

def testErrorMessagePosition(self):
lex = j8.LexerDecoder("[ u'hi']", False)
lex = j8.LexerDecoder("[ u'hi']", False, 'J8')
try:
_PrintTokens(lex)
except error.Decode as e:
Expand Down
2 changes: 1 addition & 1 deletion doc/io-builtins.md
Expand Up @@ -45,7 +45,7 @@ These are discussed in more detail in the [strings](strings.html) doc.

Example:

hostname | read --all :x
hostname | read --all (&x)
write -- $x

## Summary of YSH features
Expand Down
2 changes: 1 addition & 1 deletion osh/split.py
Expand Up @@ -179,7 +179,7 @@ def SplitForWordEval(self, s, ifs=None):
# type: (str, Optional[str]) -> List[str]
"""Split used by word evaluation.
Also used by the explicit @split() function.
Also used by the explicit shSplit() function.
"""
sp = self._GetSplitter(ifs=ifs)
spans = sp.Split(s, True)
Expand Down
4 changes: 4 additions & 0 deletions osh/word_eval.py
Expand Up @@ -54,6 +54,7 @@
from core import state
from core import ui
from core import util
from data_lang import j8
from data_lang import j8_lite
from core.error import e_die
from frontend import consts
Expand Down Expand Up @@ -2300,7 +2301,10 @@ def CheckCircularDeps(self):
def _EvalCommandSub(self, cs_part, quoted):
# type: (CommandSub, bool) -> part_value_t
stdout_str = self.shell_ex.RunCommandSub(cs_part)

if cs_part.left_token.id == Id.Left_AtParen:
# YSH splitting algorithm: does not depend on IFS
#strs = j8.SplitJ8Lines(stdout_str)
strs = self.splitter.SplitForWordEval(stdout_str)
return part_value.Array(strs)
else:
Expand Down
20 changes: 16 additions & 4 deletions spec/ysh-command-sub.test.sh
Expand Up @@ -66,16 +66,28 @@ c

#### @() decodes J8 Lines

# syntax errors - TODO: document this in doc/chap-errors

# - quotes that don't match
# - syntax error inside quotes
# - unquoted line isn't valid UTF-8

var b = @(
# spaces stripped here
echo " unquoted ";
# I guess this is allowed
echo $'binary \xff';

# Not allowed, since unquoted lines should be UTF-8
#echo $'binary \xff';

echo '"json\n\u03bc"';
echo "u'j8 u \\u{3bc}'";
echo "b'j8 b \\y{ff'";
echo "b'j8 b \\yff'";

# no prefix is the same as u''
echo "'j8 u \\u{3bc}'";
)

pp line (b)
json write (b)

## STDOUT:
## END
Expand Down
29 changes: 29 additions & 0 deletions test/ysh-runtime-errors.sh
Expand Up @@ -916,6 +916,35 @@ pp line (d)
'
}

# Error cases for @() command subs whose output should be decoded as J8 Lines.
# Each case passes a YSH snippet in a here doc to _ysh-error-here-X, with 2 as
# the first argument (presumably the expected exit status - confirm against
# the helper's definition).
test-command-sub-j8() {
# Disabled for now: this early return skips every case below.
return

# quotes that don't match
_ysh-error-here-X 2 <<'EOF'
var lines = @(
echo "\"unbalanced"
)
pp line (lines)
EOF

# syntax error inside quotes
_ysh-error-here-X 2 <<'EOF'
var lines = @(
echo '"\"'
)
pp line (lines)
EOF

# unquoted line isn't valid UTF-8
_ysh-error-here-X 2 <<'EOF'
var lines = @(
echo $'\xff'
)
pp line (lines)
EOF

}

# Runs run-test-funcs, which is not defined in the visible part of this file.
# NOTE(review): presumably the entry point for the Soil CI job that runs the
# test-* functions here - confirm.
soil-run-py() {
run-test-funcs
}
Expand Down
6 changes: 5 additions & 1 deletion ysh/expr_eval.py
Expand Up @@ -53,6 +53,7 @@
from core import state
from core import ui
from core import vm
from data_lang import j8
from frontend import lexer
from frontend import match
from frontend import typed_args
Expand Down Expand Up @@ -1027,8 +1028,11 @@ def _EvalExpr(self, node):
else:
stdout_str = self.shell_ex.RunCommandSub(node)
if id_ == Id.Left_AtParen: # @(seq 3)
# TODO: Should use J8 lines
# YSH splitting algorithm: does not depend on IFS
#strs = j8.SplitJ8Lines(stdout_str)

strs = self.splitter.SplitForWordEval(stdout_str)

items = [value.Str(s)
for s in strs] # type: List[value_t]
return value.List(items)
Expand Down

0 comments on commit 04d9509

Please sign in to comment.