Permalink
Browse files

Properly lex, parse and evaluate $''.

The lexing of backslash escapes is shared with 'echo -e' to the greatest
extent possible.

- Semi-automatically extract some tests for echo -e and $''.  I found
  some corner cases where bash's echo -e and $'' diverge.  (e.g. octal
  escape value out of bounds)
- core/lexer_gen.py: Properly translate regexes with {0,1}
- The SingleQuotedPart now includes the left quote token.
- Introduce EmptyPart for 'foo=' and "${a:-}", instead of reusing
  SingleQuotedPart.
- Update lint count scripts.
  • Loading branch information...
Andy Chu
Andy Chu committed Jan 8, 2018
1 parent 019dfc7 commit 5ea432799b3cd40033b892ec99f646aef0decfb9
Showing with 281 additions and 138 deletions.
  1. +0 −14 build/codegen.sh
  2. +4 −0 build/dev.sh
  3. +7 −69 core/builtin.py
  4. +15 −8 core/lexer_gen.py
  5. +2 −0 core/lexer_gen_test.py
  6. +1 −1 core/word.py
  7. +75 −0 core/word_compile.py
  8. +13 −1 core/word_eval.py
  9. +29 −0 gold/dollar-sq.sh
  10. +34 −0 gold/echo-e.sh
  11. +2 −1 osh/ast_.py
  12. +3 −3 osh/cmd_parse_test.py
  13. +36 −19 osh/lex.py
  14. +11 −0 osh/lex_test.py
  15. +3 −1 osh/osh.asdl
  16. +8 −6 osh/word_parse.py
  17. +6 −4 scripts/count.sh
  18. +22 −0 spec/builtin-io.test.sh
  19. +9 −9 test/gold.sh
  20. +1 −2 test/spec.sh
View
@@ -90,20 +90,6 @@ ast-id-lex() {
osh-lex-gen-native
}
# Regenerate the lexer tables, then rebuild the native fastlex extension.
lexer() {
  ast-id-lex

  # Unclear why this manual removal is needed -- the build should handle it.
  rm -f _devbuild/py-ext/x86_64/fastlex.so

  # NOTE: this also builds pylibc, which we don't want here.
  build/dev.sh fastlex
}
# TODO:
# asdl/run.sh gen-osh-python
# Size profiler for binaries. TODO: Fold this into benchmarks/
bloaty() { ~/git/other/bloaty/bloaty "$@"; }
View
@@ -72,6 +72,10 @@ pylibc() {
# Build the native fastlex extension from freshly generated lexer tables.
fastlex() {
build/codegen.sh ast-id-lex
# Why do we need this? It gets stale otherwise.
rm -f _devbuild/py-ext/x86_64/fastlex.so
py-ext fastlex build/setup_fastlex.py
PYTHONPATH=. native/fastlex_test.py
}
View
@@ -35,6 +35,7 @@
from core import runtime
from core import util
from core import state
from core import word_compile
from core.id_kind import Id
from osh import lex
@@ -49,7 +50,6 @@
e_die = util.e_die
# NOTE: NONE is a special value.
# TODO:
# - Make a table of name to enum? source, dot, etc.
@@ -222,73 +222,11 @@ def Resolve(argv0):
return EBuiltin.NONE
ECHO_LEXER = lexer.SimpleLexer(lex.C_STRING_DEF)
_ONE_CHAR = {
'0': '\0',
'a': '\a',
'b': '\b',
'e': '\x1b',
'E': '\x1b',
'f': '\f',
'n': '\n',
'r': '\r',
't': '\t',
'v': '\v',
'\\': '\\',
}
# Strict mode syntax errors:
#
# \x is a syntax error -- needs two digits (It's like this in C)
# \0777 is a syntax error -- we shouldn't do modulus
# \d could be a syntax error -- it is better written as \\d
def _EvalStringPart(id_, value):
  """Translate one token from the C-string lexer into its literal text.

  Returns the evaluated string, or None for the \\c sentinel (echo -e
  stops processing when it sees \\c).
  """
  # TODO: This has to go in the compile stage for DOLLAR_SQ strings.
  if id_ == Id.Char_OneChar:
    return _ONE_CHAR[value[1]]

  if id_ == Id.Char_Stop:  # \c returns a special sentinel
    return None

  if id_ == Id.Char_Octal:
    # TODO: Error checking for \0777
    code = int(value[2:], 8)
    if code >= 256:
      code = code % 256
      # NOTE: Strict mode would raise instead of wrapping:
      #raise AssertionError('Out of range')
    return chr(code)

  if id_ == Id.Char_Hex:
    return chr(int(value[2:], 16))

  if id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
    # \uXXXX and \UXXXXXXXX evaluate the same way; only the escape differs.
    return unichr(int(value[2:], 16))

  if id_ == Id.Char_Literals:
    return value

  raise AssertionError
ECHO_LEXER = lexer.SimpleLexer(lex.ECHO_E_DEF)
echo_spec = _Register('echo')
echo_spec.ShortFlag('-e') # no backslash escapes
echo_spec.ShortFlag('-n')
ECHO_SPEC = _Register('echo')
ECHO_SPEC.ShortFlag('-e')  # interpret backslash escapes
ECHO_SPEC.ShortFlag('-n')  # NOTE(review): presumably suppresses the trailing newline, per standard echo
def Echo(argv):
@@ -307,14 +245,14 @@ def Echo(argv):
# - 'echo -c' should print '-c', not fail
# - echo '---' should print ---, not fail
arg, arg_index = echo_spec.ParseLikeEcho(argv)
arg, arg_index = ECHO_SPEC.ParseLikeEcho(argv)
argv = argv[arg_index:]
if arg.e:
new_argv = []
for a in argv:
parts = []
for id_, value in ECHO_LEXER.Tokens(a):
p = _EvalStringPart(id_, value)
p = word_compile.EvalCStringToken(id_, value)
# Unusual behavior: '\c' prints what is there and aborts processing!
if p is None:
View
@@ -6,6 +6,7 @@
import cStringIO
import sys
import sre_parse
import sre_constants
from osh import lex
@@ -104,6 +105,7 @@ def TranslateConstant(pat):
return '"' + ''.join(_Literal(ord(c)) for c in pat) + '"'
def TranslateTree(re_tree, f, in_char_class=False):
"""
re_tree: List of children
@@ -120,15 +122,20 @@ def TranslateTree(re_tree, f, in_char_class=False):
# min = 0 means *, min = 1 means +
assert min_ in (0, 1), min_
TranslateTree(children, f)
if min_ == 0:
if max_ == 1:
f.write('? ')
if min_ == 0 and max_ == 1:
f.write('? ')
elif max_ == sre_constants.MAXREPEAT:
if min_ == 0:
f.write('* ')
elif min_ == 1:
f.write('+ ')
else:
f.write('* ')
elif min_ == 1:
f.write('+ ')
else:
assert 0, min_
assert 0, min_
else: # re2c also supports [0-7]{1,2} syntax
f.write('{%d,%d} ' % (min_, max_))
elif name == 'negate': # ^ in [^a-z]
assert arg is None
View
@@ -33,6 +33,8 @@ def testTranslateRegex(self):
(r'[^a-z]', r'[^a-z]'),
# . isn't special inside
(r'[a\.]', r'[a.]'),
(r'[0-7]{1,3}', r'[0-7]{1,3} '),
]
for py, expected in PAIRS:
#self.assertEqual(expected, lexer_gen.TranslateRegex(py))
View
@@ -440,7 +440,7 @@ def LooksLikeAssignment(w):
# This fake SingleQuotedPart is necessary so that EmptyUnquoted elision
# isn't applied. EMPTY= is like EMPTY=''.
# TODO: This part doesn't have spids, so it might break some invariants.
rhs.parts.append(ast.SingleQuotedPart())
rhs.parts.append(ast.EmptyPart())
else:
for p in w.parts[1:]:
rhs.parts.append(p)
View
@@ -0,0 +1,75 @@
#!/usr/bin/python
"""
word_compile.py
This is called the "compile" stage because it happens after parsing, but it
doesn't depend on any values at runtime.
"""
from core.id_kind import Id
# Backslash escapes that evaluate to a single fixed character.  Shared by
# echo -e and $'' via EvalCStringToken below.  Longer forms (\xHH, \0NNN,
# \uXXXX, \UXXXXXXXX) are handled by their own token Ids, not this table.
_ONE_CHAR = {
'0': '\0',
'a': '\a',
'b': '\b',
'e': '\x1b',
'E': '\x1b',
'f': '\f',
'n': '\n',
'r': '\r',
't': '\t',
'v': '\v',
'\\': '\\',
}
# TODO: Strict mode syntax errors:
#
# \x is a syntax error -- needs two digits (It's like this in C)
# \0777 is a syntax error -- we shouldn't do modulus
# \d could be a syntax error -- it is better written as \\d
def EvalCStringToken(id_, value):
  """Evaluate a single token from the C-string lexer.

  This function is shared between echo -e and $''.
  $'' could use it at compile time, much like brace expansion in braces.py.

  Args:
    id_: a token Id (e.g. Id.Char_OneChar, Id.Char_Octal, ...)
    value: the matched text, e.g. '\\n', '\\044', '\\x65'

  Returns:
    The evaluated string, or None for \\c, which is a sentinel telling
    the caller (echo -e) to stop processing.

  Raises:
    AssertionError: on a token Id this function doesn't handle.
  """
  if id_ == Id.Char_OneChar:
    c = value[1]
    return _ONE_CHAR[c]
  elif id_ == Id.Char_Stop:  # \c returns a special sentinel
    return None
  elif id_ == Id.Char_Octal:
    # TODO: Error checking for \0777
    s = value[2:]
    i = int(s, 8)
    if i >= 256:
      i = i % 256
      # NOTE: Strict mode would raise instead of wrapping:
      #raise AssertionError('Out of range')
    return chr(i)
  elif id_ == Id.Char_Hex:
    s = value[2:]
    i = int(s, 16)
    return chr(i)
  elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
    # \uXXXX and \UXXXXXXXX evaluate identically; only the escape length
    # differs, and the lexer already validated the digit count.
    s = value[2:]
    i = int(s, 16)
    return unichr(i)
  elif id_ == Id.Char_Literals:
    return value
  else:
    # Include the offending id so failures are diagnosable (the original
    # bare AssertionError gave no clue which token was unhandled).
    raise AssertionError(id_)
View
@@ -12,6 +12,7 @@
from core.id_kind import Id, Kind, LookupKind
from core import runtime
from core import state
from core import word_compile
from core import util
from osh import ast_ as ast
@@ -616,8 +617,19 @@ def _EvalWordPart(self, part, part_vals, quoted=False):
v = runtime.StringPartValue(s, False)
part_vals.append(v)
elif part.tag == word_part_e.EmptyPart:
part_vals.append(runtime.StringPartValue('', False))
elif part.tag == word_part_e.SingleQuotedPart:
s = ''.join(t.val for t in part.tokens)
if part.left.id == Id.Left_SingleQuote:
s = ''.join(t.val for t in part.tokens)
elif part.left.id == Id.Left_DollarSingleQuote:
# NOTE: This could be done at compile time
s = ''.join(word_compile.EvalCStringToken(t.id, t.val)
for t in part.tokens)
else:
raise AssertionError(part.left.id)
v = runtime.StringPartValue(s, False)
part_vals.append(v)
View
@@ -1,6 +1,35 @@
#!/bin/bash
#
# Adapted from gold/echo-e.sh, which was adapted from spec/builtin-io.test.sh.
# Basic escapes inside $''.
echo $'foo\tbar\n'
echo $'foo\tbar\n\
baz'
echo $'\\'
echo $'abc\ndef\n'
echo $'\a\b\d\e\f'
echo $'\n\r\t\v'
# Doesn't pass because Python can have NUL embedded in strings!
#echo $'ab\0cd' | od -A n -c | sed 's/ \+/ /g'
# Hex, octal, and Unicode escapes followed by a literal character.
echo $'abcd\x65f'
echo $'abcd\044e'
echo $'abcd\u0065f'
echo $'abcd\U00000065f'
# In bash, these are different than echo -e. I'm not sure why yet.
#echo $'\03777' | od -A n -t x1 | sed 's/ \+/ /g'
#echo $'\04000' | od -A n -t x1 | sed 's/ \+/ /g'
#echo $'\0777' | od -A n -t x1 | sed 's/ \+/ /g'
# Escapes with fewer digits than the maximum.
echo $'abcd\x6' | od -A n -c | sed 's/ \+/ /g'
echo $'\x' $'\xg' | od -A n -c | sed 's/ \+/ /g'
echo $'abcd\04' | od -A n -c | sed 's/ \+/ /g'
echo $'abcd\u006' | od -A n -c | sed 's/ \+/ /g'
echo $'\u6' | od -A n -c | sed 's/ \+/ /g'
#echo $'\0' '\1' '\8' | od -A n -c | sed 's/ \+/ /g'
# Literal newline vs. backslash-continued newline inside $''.
echo $'foo
bar'
echo $'foo\
bar'
View
@@ -0,0 +1,34 @@
#!/bin/bash
#
# Semi-automatically extracted like this:
#
# grep ^echo spec/builtin-io.test.sh.
#
# For converting to gold/dollar-sq.sh.
# Basic escapes with echo -e.
echo -e '\\'
echo -en 'abc\ndef\n'
echo -ez 'abc\n'
echo -e '\a\b\d\e\f'
echo -e '\n\r\t\v'
echo -e 'ab\0cd'
# \c aborts processing of the rest of the arguments.
echo -e xy 'ab\cde' 'ab\cde'
# Hex, octal, and Unicode escapes followed by a literal character.
echo -e 'abcd\x65f'
echo -e 'abcd\044e'
echo -e 'abcd\u0065f'
echo -e 'abcd\U00000065f'
# Out-of-range octal values.
echo -en '\03777' | od -A n -t x1 | sed 's/ \+/ /g'
echo -en '\04000' | od -A n -t x1 | sed 's/ \+/ /g'
echo -e '\0777' | od -A n -t x1 | sed 's/ \+/ /g'
# Escapes with fewer digits than the maximum.
echo -en 'abcd\x6' | od -A n -c | sed 's/ \+/ /g'
echo -e '\x' '\xg' | od -A n -c | sed 's/ \+/ /g'
echo -e 'abcd\04' | od -A n -c | sed 's/ \+/ /g'
echo -en 'abcd\u006' | od -A n -c | sed 's/ \+/ /g'
echo -e '\u6' | od -A n -c | sed 's/ \+/ /g'
echo -e '\0' '\1' '\8' | od -A n -c | sed 's/ \+/ /g'
# Literal newline vs. backslash-continued newline in the argument.
echo -e 'foo
bar'
echo -e 'foo\
bar'
View
@@ -73,7 +73,8 @@ def AbbreviateNodes(obj, node):
for part in obj.parts:
node.unnamed_fields.append(MakeTree(part, AbbreviateNodes))
elif node.node_type == 'SingleQuotedPart':
# Only abbreviate 'foo', not $'foo\n'
elif node.node_type == 'SingleQuotedPart' and obj.left.id == Id.Left_SingleQuote:
node.abbrev = True
node.node_type = 'SQ'
Oops, something went wrong.

0 comments on commit 5ea4327

Please sign in to comment.