Permalink
Browse files

Prepare to implement 'read -r' and IFS splitting for 'read'.

Aboriginal Linux uses both of these mechanisms.

- More spec tests for the read builtin.  As usual, I uncovered shell
  incompatibilities, like whether they recognize C backslash-escapes in
  read input!

- Add unit tests for the read builtin.  For example, \ escapes IFS chars
  as well as newline.

- Create a legacy.py module for splitting by IFS.  'read' and unquoted
  word splicing will both share some of this logic.  At the very least,
  they both have the concept of "IFS whitespace".
  • Loading branch information...
Andy Chu
Andy Chu committed Jan 2, 2018
1 parent 52a4d11 commit 672988dc7de3fc88c4799816318cb6018284965c
Showing with 323 additions and 177 deletions.
  1. +41 −3 core/builtin.py
  2. +34 −0 core/builtin_test.py
  3. +102 −0 core/legacy.py
  4. +92 −0 core/legacy_test.py
  5. +3 −94 core/word_eval.py
  6. +2 −80 core/word_eval_test.py
  7. +49 −0 spec/builtin-io.test.sh
View
@@ -492,21 +492,57 @@ def Jobs(argv, job_state):
return 0
# Summary:
# - Split with IFS, except that \ can escape IFS characters! This is different
# from the algorithm for splitting words (at least the way I've represented it).
# - And
# Bash manual:
# - If there are more words than names, the remaining words and their
# intervening delimiters are assigned to the last name.
# - If there are fewer words read from the input stream than names, the
# remaining names are assigned empty values.
# - The characters in the value of the IFS variable are used to split the line
# into words using the same rules the shell uses for expansion (described
# above in Word Splitting).
# - The backslash character '\' may be used to remove any special meaning for
# the next character read and for line continuation.
#
# Hm but word splitting isn't affected by \<space>
#
# I think I have to make two passes.
#
# 1. Process backslashes (or don't if it's -r)
# 2. Split.
# Flag spec for the 'read' builtin; parsed in Read() via READ_SPEC.Parse(argv).
READ_SPEC = _Register('read')
# -r: don't process backslashes in the input (see the two-pass plan above).
READ_SPEC.ShortFlag('-r')
# -n N: read a certain number of bytes; takes an integer argument.
READ_SPEC.ShortFlag('-n', args.Int)
def _SplitLine(line, ifs, allow_escape):
continued = False
# Or should I just return a list of indices?
parts = []
n = len(line)
for i in xrange(n):
c = line[i]
return parts, continued
def Read(argv, mem):
# TODO:
# - Use IFS instead of Python's split().
arg, i = READ_SPEC.Parse(argv)
if not arg.r:
util.warn('*** read without -r not implemented ***')
names = argv[i:]
if arg.n is not None:
if arg.n is not None: # read a certain number of bytes
try:
name = names[0]
except IndexError:
@@ -518,6 +554,8 @@ def Read(argv, mem):
# NOTE: Even if we don't get n bytes back, there is no error?
return 0
# We have to read more than one line if there is a line continuation (and
# it's not -r).
line = sys.stdin.readline()
if not line: # EOF
return 1
View
@@ -17,6 +17,40 @@ def testEchoLexer(self):
print list(lex.Tokens(r'unicode \u0065 \U00000065'))
print list(lex.Tokens(r'\d \e \f \g'))
def testSplitLine(self):
# NOTE: This function can be rewritten in C or C++. Use ASAN / fuzzing?
# It's similar to the functions in core/glob_.py.
#
# Can you use regexes? Need different regexes for "allow_escape".
# Nah I think I need to rewrite _IfsSplit in word_eval.c.
# That is a very similar function.
#
# Can you lift it from dash? The other shells all GPL.
# word_eval_test._IfsSplit has at least one bug! With IFS='_ '. Maybe
# should test that here.
DEFAULT_IFS = ' \t\n'
OTHER_IFS = ':'
# allow_escape is True by default, but False when the user passes -r.
CASES = [
#(' one two ', DEFAULT_IFS, False, ['one', 'two'], False),
(' one:two ', OTHER_IFS, True, [' one', 'two '], False),
(' one\:two ', OTHER_IFS, True, [' one:two '], False),
(' one\:two ', OTHER_IFS, False, [r' one\', two '], False),
]
# Not worknig yet!
return
for line, ifs, allow_escape, expected_parts, expected_c in CASES:
parts, continued = builtin._SplitLine(line, ifs, allow_escape)
self.assertEqual(expected_parts, parts,
'%r: %s != %s' % (line, expected_parts, parts))
self.assertEqual(expected_c, continued,
'%r: %s != %s' % (line, expected_c, continued))
# Run the tests in this module when it is executed directly.
if __name__ == '__main__':
    unittest.main()
View
@@ -0,0 +1,102 @@
#!/usr/bin/python
"""
legacy.py: IFS splitting logic, to be shared by the 'read' builtin and word
evaluation.
"""
import re
from core import runtime
value_e = runtime.value_e
def GetIfs(mem):
    """Return the IFS string to use for field splitting.

    Used for splitting words in Splitter.

    Args:
      mem: object whose GetVar('IFS') returns a value with a .tag, and a .s
        attribute when the tag is value_e.Str.

    Returns:
      The value of IFS as a string.  If IFS is unset, the default
      ' \\t\\n' is returned, because POSIX specifies that an unset IFS behaves
      as <space><tab><newline>.  (An IFS that is set to the empty string is
      returned as '', which disables splitting.)

    Raises:
      AssertionError: if IFS is neither undefined nor a string.
    """
    val = mem.GetVar('IFS')
    if val.tag == value_e.Undef:
        # Unset is NOT the same as empty: splitting happens as if IFS had its
        # default value.
        return ' \t\n'
    if val.tag == value_e.Str:
        return val.s
    # TODO: Raise proper error
    raise AssertionError("IFS shouldn't be an array")
def _Split(s, ifs):
"""Helper function for IFS split."""
parts = ['']
for c in s:
if c in ifs:
parts.append('')
else:
parts[-1] += c
return parts
def IfsSplit(s, ifs):
    """Split the string s into fragments according to the IFS characters.

    http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_05
    https://www.gnu.org/software/bash/manual/bashref.html#Word-Splitting

    Summary of the rules:

    1. ' \\t\\n' is special.  Whitespace is trimmed off the front and back.
    2. If IFS is '', no field splitting is performed.
    3. Otherwise, suppose IFS = ' ,\\t'.  Then "IFS whitespace" is space or
       tab, and comma is the other (non-whitespace) IFS character.
       a. IFS whitespace is ignored at the beginning and end.
       b. Any other IFS char delimits a field, along with adjacent IFS
          whitespace.
       c. IFS whitespace shall delimit a field.

    Known limitations of this implementation:
    - Rules 1 and 3a are not implemented: leading and trailing delimiters
      produce empty strings at the ends of the result.
    - BUG: re.split() is the wrong model; it works with the 'delimiting'
      model.  With mixed IFS, the whitespace-only alternative is tried first,
      so whitespace preceding a non-whitespace IFS char is matched as its own
      delimiter.  TODO: use forward iteration; grep for IFS in
      dash/mksh/bash/ash.

    Args:
      s: the string to split.
      ifs: a str of IFS characters.

    Returns:
      A list of strings; [s] if ifs is empty.
    """
    assert isinstance(ifs, str), ifs

    if not ifs:
        return [s]  # no splitting

    # TODO: Should this detect if it's ALL whitespace, i.e. there are no
    # non-whitespace IFS chars?
    if ifs == ' \t\n':
        return _Split(s, ifs)

    # Partition the IFS chars into "IFS whitespace" and everything else.
    whitespace_chars = ''.join(ch for ch in ifs if ch in ' \t\n')
    other_chars = ''.join(ch for ch in ifs if ch not in ' \t\n')

    # Hack to make an RE: escape the chars so they can go in a character class.
    ws_re = re.escape(whitespace_chars)
    other_re = re.escape(other_chars)

    #   ifs_ws | ifs_ws* non_ws_ifs ifs_ws*
    if whitespace_chars and other_chars:
        # The first alternative is rule 3c.
        # BUG: It matches the whitespace first?
        pat = '[%s]+|[%s]*[%s][%s]*' % (ws_re, ws_re, other_re, ws_re)
    elif whitespace_chars:
        pat = '[%s]+' % ws_re
    elif other_chars:
        pat = '[%s]' % other_re
    else:
        # ifs is non-empty here, so one of the two groups must be non-empty.
        raise AssertionError

    return re.split(pat, s)
View
@@ -0,0 +1,92 @@
#!/usr/bin/python -S
"""
legacy_test.py: Tests for legacy.py
"""
import unittest
from core import legacy # module under test
class SplitTest(unittest.TestCase):
    """Table-driven tests for legacy.IfsSplit."""

    def _CheckCases(self, cases):
        """Assert legacy.IfsSplit(s, ifs) == expected for each case, in order."""
        for expected, s, ifs in cases:
            self.assertEqual(expected, legacy.IfsSplit(s, ifs))

    def testIfsSplitEmpty(self):
        self._CheckCases([
            ([''], '', ' \t\n'),
            (['', ''], ' ', ' \t\n'),
            ([''], '', ' '),
            # No word splitting when no IFS.  Hm.
            ([''], '', ''),
        ])

    def testIfsSplit(self):
        self._CheckCases([
            (['', 'foo', 'bar', ''], '\tfoo bar\n', ' \t\n'),
            (['\tfoo bar\n'], '\tfoo bar\n', ''),
            (['a', '', 'd'], 'abcd', 'bc'),
        ])

    def testIfsSplit_Mixed(self):
        self._CheckCases([
            (['a', 'cd'], 'abcd', ' b'),
            # IFS whitespace rule
            (['a', 'c'], 'abc', 'b '),
            (['a', 'c'], 'a c', 'b '),
            (['a', '', 'c'], 'abbc', 'b '),
            (['', 'a', '', '', 'cd', ''], '\ta b\tb cd\n', 'b \t\n'),
            (['', 'a', 'cd', ''], '\tabcd\n', 'b \t\n'),
        ])

    def testIfsSplit_Mixed2(self):
        # Doesn't work yet, so bail out before the assertion.
        return
        self._CheckCases([
            (['a', '', '', 'b'], 'a _ _ _ b', '_ '),
        ])

    def testIfsSplitWhitespaceOnly(self):
        # No non-whitespace IFS
        self._CheckCases([
            (['', 'a', 'c', ''], ' a c ', ' '),
            (['', 'c'], ' c', ' \t\n'),
        ])

    def testIfsSplitNonWhitespaceOnly(self):
        self._CheckCases([
            (['a', 'c'], 'a_c', '_'),
            (['', ''], '_', '_'),
        ])
# Run the tests in this module when it is executed directly.
if __name__ == '__main__':
    unittest.main()
Oops, something went wrong.

0 comments on commit 672988d

Please sign in to comment.