Permalink
Browse files

Implement echo -e, with extensive tests.

I use a simple regex lexer that can be ported to re2c eventually.

I found bugs / divergent behavior in various shells:

- \1 - is \x01 or literal '\1'?
- \x - is it the NUL byte or literal '\x'?  Syntax error in C.
- is \E implemented?

TODO: What about \c?
  • Loading branch information...
Andy Chu
Andy Chu committed Dec 26, 2017
1 parent 7fd097b commit 9c8814da1d0b1bd74dd9814096e55fd4358dfcc8
Showing with 285 additions and 15 deletions.
  1. +100 −1 core/builtin.py
  2. +22 −0 core/builtin_test.py
  3. +8 −0 core/id_kind.py
  4. +1 −0 core/id_kind_test.py
  5. +28 −0 core/lexer.py
  6. +1 −1 doc/osh-quick-ref-toc.txt
  7. +8 −2 osh/lex.py
  8. +115 −9 spec/builtin-io.test.sh
  9. +2 −2 test/spec.sh
View
@@ -30,9 +30,11 @@
import sys
from core import args
from core import lexer
from core import runtime
from core import util
from core import state
from core.id_kind import Id
from osh import lex
@@ -218,6 +220,94 @@ def Resolve(argv0):
return EBuiltin.NONE
R = lexer.R
C = lexer.C

# TODO:
# - See if re2c supports {1,2} etc.
# - the DOLLAR_SQ lex state needs most of this logic.

# Token patterns for the string arguments of 'echo -e'.  SimpleLexer tries
# every pattern at the current position and keeps the longest match, so a
# specific escape such as \044 wins over the shorter \0.
ECHO_LEXER = lexer.SimpleLexer([
    R(r'\\[0abeEfrtnv]', Id.Char_OneChar),

    # Note: tokens above \0377 can either be truncated or be flagged a syntax
    # error in strict mode.
    R(r'\\0[0-7]{1,3}', Id.Char_Octal),

    # \x6 is valid in bash
    R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex),

    # Code points are written in hex (they are parsed with int(s, 16)), so
    # the digits a-f must be accepted too, not only 0-9.
    R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4),
    R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),

    # A run of anything that is not a backslash.
    R(r'[^\\]+', Id.Char_Literals),

    # A backslash followed by any other character is kept literally.  '.'
    # alone does not match a newline, so it is listed explicitly; otherwise
    # backslash-newline would match no pattern at all.
    R(r'\\(.|\n)', Id.Char_Literals),

    # Backslash that ends the string.  This must be a raw string: without the
    # r prefix the regex is \$, which matches a literal dollar sign instead.
    R(r'\\$', Id.Char_Literals),

    # For re2c.  TODO: need to make that translation.
    C('\\\0', Id.Char_Literals),
])
# Lookup table for Id.Char_OneChar tokens: the character that follows the
# backslash -> the single character the escape stands for.
_ONE_CHAR = dict([
    ('0', '\0'),    # NUL
    ('a', '\a'),    # bell
    ('b', '\b'),    # backspace
    ('t', '\t'),    # horizontal tab
    ('n', '\n'),    # newline
    ('v', '\v'),    # vertical tab
    ('f', '\f'),    # form feed
    ('r', '\r'),    # carriage return
    ('e', '\x1b'),  # escape
    ('E', '\x1b'),  # escape (upper-case spelling)
])
# Strict mode syntax errors:
#
# \x is a syntax error -- needs two digits (It's like this in C)
# \0777 is a syntax error -- we shouldn't do modulus
# \d could be a syntax error -- it is better written as \\d
def _EvalStringPart(id_, value):
    r"""Turn one token from ECHO_LEXER into the text it stands for.

    Args:
      id_: Token type, one of the Id.Char_* values.
      value: The matched source text.  For escape tokens it still includes
        the leading backslash and the escape letter.

    Returns:
      A byte string.  Unicode escapes are UTF-8 encoded rather than returned
      as unicode objects, so every token type yields the same string type.
      Otherwise joining the parts would mix str and unicode, and a raw high
      byte (e.g. from \0377 or \xff) next to a non-ASCII code point would
      fail with an implicit ASCII decode.

    Raises:
      AssertionError: id_ is not one of the handled Char token types.
      ValueError: propagated from unichr() when the code point is outside
        the range the interpreter supports.
    """
    # TODO: This has to go in the compile stage for DOLLAR_SQ strings.
    if id_ == Id.Char_OneChar:
        # value is a backslash plus one letter; the letter selects the char.
        return _ONE_CHAR[value[1]]

    if id_ == Id.Char_Octal:
        # Skip the leading \0, then parse the remaining octal digits.
        # TODO: Error checking for \0777
        # NOTE: Values above \0377 wrap around modulo 256.  Strict mode
        # should flag them as out of range instead.
        return chr(int(value[2:], 8) % 256)

    if id_ == Id.Char_Hex:
        # Skip the leading \x.
        return chr(int(value[2:], 16))

    if id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        # Skip the leading \u or \U; both forms hold a hex code point.
        code_point = int(value[2:], 16)
        return unichr(code_point).encode('utf-8')

    if id_ == Id.Char_Literals:
        return value

    raise AssertionError('Unexpected token type %r' % (id_,))
# Flag spec for the 'echo' builtin; Echo() parses its argv against this spec
# with ParseLikeEcho().
echo_spec = _Register('echo')
echo_spec.ShortFlag('-e') # interpret backslash escapes in the arguments
echo_spec.ShortFlag('-n')
@@ -235,7 +325,16 @@ def Echo(argv):
arg, i = echo_spec.ParseLikeEcho(argv)
if arg.e:
util.warn('*** echo -e not implemented ***')
new_argv = []
for a in argv:
parts = []
for id_, value in ECHO_LEXER.Tokens(a):
p = _EvalStringPart(id_, value)
parts.append(p)
new_argv.append(''.join(parts))
# Replace it
argv = new_argv
#log('echo argv %s', argv)
n = len(argv)
View
@@ -0,0 +1,22 @@
#!/usr/bin/python -S
"""
builtin_test.py: Tests for builtin.py
"""
import unittest
from core import lexer
from core import builtin # module under test
class BuiltinTest(unittest.TestCase):
    """Smoke tests for core/builtin.py."""

    def testEchoLexer(self):
        """Tokenize sample 'echo -e' strings and print the token lists.

        Nothing is asserted; the test fails only if the lexer raises.
        """
        echo_lexer = builtin.ECHO_LEXER
        samples = (
            r'newline \n NUL \0 octal \0377 hex \x00',
            r'unicode \u0065 \U00000065',
            r'\d \e \f \g',
        )
        for sample in samples:
            print(list(echo_lexer.Tokens(sample)))


if __name__ == '__main__':
    unittest.main()
View
@@ -7,6 +7,9 @@
# http://www.apache.org/licenses/LICENSE-2.0
"""
id_kind.py - Id and Kind definitions, used for Token, Word, Nodes, etc.
NOTE: If this changes, the lexer may need to be recompiled with
build/codegen.sh lexer.
"""
from core import util
@@ -394,6 +397,11 @@ def _AddKinds(spec):
# dynamically-resolved builtins.
spec.AddKind('ControlFlow', ['Break', 'Continue', 'Return'])
# For C-escaped strings.
spec.AddKind('Char', [
'OneChar', 'Hex', 'Octal', 'Unicode4', 'Unicode8', 'Literals'
])
# Id -> OperandType
BOOL_OPS = {} # type: dict
View
@@ -78,6 +78,7 @@ def MakeLookup(p):
lookup2 = MakeLookup(id_kind.ID_SPEC.LexerPairs(Kind.BoolBinary))
self.assertEqual(Id.BoolBinary_eq, lookup2['-eq'])
#print(lookup2)
def testPrintStats(self):
k = id_kind._kind_sizes
View
@@ -233,3 +233,31 @@ def Read(self, lex_mode):
#log('Read() Returning %s', t)
return t
# Based on osh/parse_lib.py MatchToken_Slow.
class SimpleLexer(object):
    """Longest-match tokenizer driven by a list of patterns.

    Use Cases:
    - echo -e, which interprets C-escaped strings.
    - read -r
    """

    def __init__(self, pat_list):
        """
        Args:
          pat_list: Pattern specs accepted by CompileAll().  The compiled
            result is stored as self.pat_list and is iterated by Tokens() as
            (compiled regex, token type) pairs.
        """
        self.pat_list = CompileAll(pat_list)

    def Tokens(self, line):
        """Yield (token type, matched text) pairs covering all of `line`.

        At each position every pattern is tried, anchored at that position,
        and the longest match wins.  When several patterns match the same
        length, the one listed first wins.  Zero-width matches are ignored:
        accepting one would leave the position unchanged and loop forever.

        Args:
          line: The string to tokenize.  An empty string yields nothing.

        Raises:
          AssertionError: No pattern consumes at least one character at the
            current position.  Because this is a generator, the error
            surfaces during iteration, after earlier tokens were yielded.
        """
        pos = 0
        n = len(line)
        while pos < n:
            best_end = pos
            best_type = None
            for regex, tok_type in self.pat_list:
                m = regex.match(line, pos)  # left-anchored
                # The strict '>' keeps the earliest pattern on ties and
                # rejects matches that consume nothing.
                if m is not None and m.end(0) > best_end:
                    best_end = m.end(0)
                    best_type = tok_type

            if best_end == pos:
                raise AssertionError(
                    'no match at position %d: %r (%r)'
                    % (pos, line, line[pos]))

            yield best_type, line[pos:best_end]
            pos = best_end
@@ -88,7 +88,7 @@ ENVIRONMENT VARIABLES
[Process State] UID EUID
[cd] PWD OLDPWD
[getopts] OPTIND OPTARG
[read] REPLY
[read] REPLY IFS
[Other] IFS
[Libraries] X RANDOM TODO:TIMESTAMP
View
@@ -3,8 +3,14 @@
It consists of a series of lexer modes, each with a regex -> Id mapping.
NOTE: Input Handling Affects Regular Expressions
------------------------------------------------
NOTE: If this changes, the lexer may need to be recompiled with
build/codegen.sh lexer.
Input Handling
--------------
Note that our style of input handling affects the regular expressions in the
lexer.
We pass one line at a time to the Lexer, via LineLexer. We must be able to
parse one line at a time because of interactive parsing (e.g. using the output
View
@@ -8,20 +8,111 @@ echo -
echo --
echo ---
# stdout-json: "-\n--\n---\n"
# BUG zsh stdout-json: "\n--\n---\n"
### echo -en
echo -en 'abc\ndef\n'
# stdout-json: "abc\ndef\n"
# N-I dash stdout-json: "-en abc\ndef\n\n"
### echo -ez (invalid flag)
# bash differs from the other two shells, but its behavior is possibly more
# sensible, if you're going to ignore the error. It doesn't make sense for the
# 'e' to mean 2 different things simultaneously: flag and literal to be
# bash differs from the other three shells, but its behavior is possibly more
# sensible, if you're going to ignore the error. It doesn't make sense for
# the 'e' to mean 2 different things simultaneously: flag and literal to be
# printed.
echo -ez 'abc\n'
# stdout-json: "-ez abc\\n\n"
# BUG dash/mksh stdout-json: "-ez abc\n\n"
# OK dash/mksh/zsh stdout-json: "-ez abc\n\n"
### echo -e with C escapes
# https://www.gnu.org/software/bash/manual/bashref.html#Bourne-Shell-Builtins
# not sure why \c is like NUL?
# zsh doesn't allow \E for some reason.
echo -e '\a\b\d\e\f'
# stdout-json: "\u0007\u0008\\d\u001b\u000c\n"
# N-I dash stdout-json: "-e \u0007\u0008\\d\\e\u000c\n"
### echo -e with whitespace C escapes
echo -e '\n\r\t\v'
# stdout-json: "\n\r\t\u000b\n"
# N-I dash stdout-json: "-e \n\r\t\u000b\n"
### \0
echo -e 'ab\0cd'
# stdout-json: "ab\u0000cd\n"
# dash truncates it
# BUG dash stdout-json: "-e ab\n"
### echo -e with hex escape
echo -e 'abcd\x65f'
# stdout-json: "abcdef\n"
# N-I dash stdout-json: "-e abcd\\x65f\n"
### echo -e with octal escape
echo -e 'abcd\044e'
# stdout-json: "abcd$e\n"
# OK dash stdout-json: "-e abcd$e\n"
### echo -e with 4 digit unicode escape
echo -e 'abcd\u0065f'
# stdout-json: "abcdef\n"
# OK dash stdout-json: "-e abcd\\u0065f\n"
### echo -e with 8 digit unicode escape
echo -e 'abcd\U00000065f'
# stdout-json: "abcdef\n"
# OK dash stdout-json: "-e abcd\\U00000065f\n"
### \0377 is the highest octal byte
echo -en '\03777' | od -A n -t x1 | sed 's/ \+/ /g'
# stdout-json: " ff 37\n"
# N-I dash stdout-json: " 2d 65 6e 20 ff 37 0a\n"
### \0400 is one more than the highest octal byte
# It is 256 % 256 which gets interpreted as a NUL byte.
echo -en '\04000' | od -A n -t x1 | sed 's/ \+/ /g'
# stdout-json: " 00 30\n"
# N-I dash stdout-json: " 2d 65 6e 20\n"
### \0777 is out of range
echo -en '\0777' | od -A n -t x1 | sed 's/ \+/ /g'
# stdout-json: " ff\n"
# OK mksh stdout-json: " c3 bf\n"
# OK dash stdout-json: " 2d 65 6e 20 ff 0a\n"
### incomplete hex escape
echo -en 'abcd\x6' | od -A n -c | sed 's/ \+/ /g'
# stdout-json: " a b c d 006\n"
# N-I dash stdout-json: " - e n a b c d \\ x 6 \\n\n"
### \x
# I consider the mksh and zsh behavior a bug because a bare \x is not an escape
echo -e '\x' '\xg' | od -A n -c | sed 's/ \+/ /g'
# stdout-json: " \\ x \\ x g \\n\n"
# N-I dash stdout-json: " - e \\ x \\ x g \\n\n"
# BUG mksh/zsh stdout-json: " \\0 \\0 g \\n\n"
### incomplete octal escape
echo -en 'abcd\04' | od -A n -c | sed 's/ \+/ /g'
# stdout-json: " a b c d 004\n"
# OK dash stdout-json: " - e n a b c d 004 \\n\n"
### incomplete unicode escape
echo -en 'abcd\u006' | od -A n -c | sed 's/ \+/ /g'
# stdout-json: " a b c d 006\n"
# N-I dash stdout-json: " - e n a b c d \\ u 0 0 6 \\n\n"
### \u6
echo -e '\u6' | od -A n -c | sed 's/ \+/ /g'
# stdout-json: " 006 \\n\n"
# N-I dash stdout-json: " - e \\ u 6 \\n\n"
### \0 \1 \8
# \0 is special, but \1 isn't in bash
# \1 is special in dash! geez
echo -e '\0' '\1' '\8' | od -A n -c | sed 's/ \+/ /g'
# stdout-json: " \\0 \\ 1 \\ 8 \\n\n"
# BUG dash stdout-json: " - e 001 \\ 8 \\n\n"
### Read builtin
# NOTE: there are TABS below
@@ -60,17 +151,13 @@ echo /$x/$y/$z/
# stdout: /A/B//
# status: 0
### get umask
umask | grep '[0-9]\+' # check for digits
# status: 0
### Read -n (with $REPLY)
echo 12345 > $TMP/readn.txt
read -n 4 x < $TMP/readn.txt
read -n 2 < $TMP/readn.txt # Do it again with no variable
argv.py $x $REPLY
# stdout: ['1234', '12']
# N-I dash stdout: []
# N-I dash/zsh stdout: []
### read -r ignores backslashes
echo 'one\ two' > $TMP/readr.txt
@@ -79,3 +166,22 @@ read -r raw < $TMP/readr.txt
argv "$escaped" "$raw"
# stdout: ['one two', 'one\\ two']
### read -r with other backslash escapes
echo 'one\ two\x65three' > $TMP/readr.txt
read escaped < $TMP/readr.txt
read -r raw < $TMP/readr.txt
argv "$escaped" "$raw"
# mksh and zsh respect the hex escapes here, but the other shells don't!
# stdout: ['one twox65three', 'one\\ two\\x65three']
# BUG mksh/zsh stdout: ['one twoethree', 'one\\ twoethree']
### read -r with \n
echo '\nline' > $TMP/readr.txt
read escaped < $TMP/readr.txt
read -r raw < $TMP/readr.txt
argv "$escaped" "$raw"
# dash/mksh/zsh are bugs because at least the raw mode should let you read a
# literal \n.
# stdout: ['nline', '\\nline']
# BUG dash/mksh/zsh stdout: ['', '']
Oops, something went wrong.

0 comments on commit 9c8814d

Please sign in to comment.