View
@@ -8,6 +8,7 @@
from core import braces
from core import expr_eval
from core import legacy
from core import glob_
from core.id_kind import Id, Kind, LookupKind
from core import runtime
@@ -42,20 +43,6 @@ def _ValueToPartValue(val, quoted):
raise AssertionError
def _GetIfs(mem):
    """Return the value of $IFS as a string, for splitting words in Splitter.

    Args:
      mem: object whose GetVar('IFS') result carries a .tag and, for string
        values, a .s attribute.

    Returns:
      '' when IFS is undefined, otherwise the string value of IFS.

    Raises:
      AssertionError: if IFS is neither undefined nor a string.
    """
    ifs_val = mem.GetVar('IFS')
    tag = ifs_val.tag
    if tag == value_e.Undef:
        return ''
    if tag == value_e.Str:
        return ifs_val.s
    # TODO: Raise proper error
    raise AssertionError("IFS shouldn't be an array")
def _GetJoinChar(mem):
"""
For decaying arrays by joining, eg. "$@" -> $@.
@@ -79,84 +66,6 @@ def _GetJoinChar(mem):
raise AssertionError("IFS shouldn't be an array")
def _Split(s, ifs):
"""Helper function for IFS split."""
parts = ['']
for c in s:
if c in ifs:
parts.append('')
else:
parts[-1] += c
return parts
def _IfsSplit(s, ifs):
"""
http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_05
https://www.gnu.org/software/bash/manual/bashref.html#Word-Splitting
Summary:
1. ' \t\n' is special. Whitespace is trimmed off the front and back.
2. if IFS is '', no field splitting is performed.
3. Otherwise, suppose IFS = ' ,\t'. Then IFS whitespace is space or comma.
a. IFS whitespace isgnored at beginning and end.
b. any other IFS char delimits the field, along with adjacent IFS
whitespace.
c. IFS whitespace shall delimit a field.
# Can we do this be regex or something? Use regex match?
"""
assert isinstance(ifs, str), ifs
if not ifs:
return [s] # no splitting
# print("IFS SPLIT %r %r" % (s, ifs))
# TODO: This detect if it's ALL whitespace? If ifs_other is empty?
if ifs == ' \t\n':
return _Split(s, ifs)
# Detect IFS whitespace
ifs_whitespace = ''
ifs_other = ''
for c in ifs:
if c in ' \t\n':
ifs_whitespace += c
else:
ifs_other += c
# TODO: Rule 3a. Ignore leading and trailing IFS whitespace?
# hack to make an RE
# Hm this escapes \t as \\\t? I guess that works.
ws_re = re.escape(ifs_whitespace)
other_re = re.escape(ifs_other)
#print('chars', repr(ifs_whitespace), repr(ifs_other))
#print('RE', repr(ws_re), repr(other_re))
# BUG: re.split() is the wrong model. It works with the 'delimiting' model.
# Forward iteration. TODO: grep for IFS in dash/mksh/bash/ash.
# ifs_ws | ifs_ws* non_ws_ifs ifs_ws*
if ifs_whitespace and ifs_other:
# first alternative is rule 3c.
# BUG: It matches the whitespace first?
pat = '[%s]+|[%s]*[%s][%s]*' % (ws_re, ws_re, other_re, ws_re)
elif ifs_whitespace:
pat = '[%s]+' % ws_re
elif ifs_other:
pat = '[%s]' % other_re
else:
raise AssertionError
#print('PAT', repr(pat))
regex = re.compile(pat)
frags = regex.split(s)
#log('split %r by %r -> frags %s', s, pat, frags)
return frags
def _SplitPartsIntoFragments(part_vals, ifs):
"""
part_value[] -> part_value[]
@@ -168,7 +77,7 @@ def _SplitPartsIntoFragments(part_vals, ifs):
if p.tag == part_value_e.StringPartValue:
#log("SPLITTING %s with ifs %r", p, ifs)
if p.do_split_elide:
frags = _IfsSplit(p.s, ifs)
frags = legacy.IfsSplit(p.s, ifs)
res = [runtime.fragment(f, True, p.do_glob) for f in frags]
#log("RES %s", res)
else:
@@ -1074,7 +983,7 @@ def _EvalWordAndReframe(self, word):
"""
part_vals = self._EvalParts(word)
#log('part_vals after _EvalParts %s', part_vals)
ifs = _GetIfs(self.mem)
ifs = legacy.GetIfs(self.mem)
frag_arrays = _SplitPartsIntoFragments(part_vals, ifs)
#log('Fragments after split: %s', frag_arrays)
frag_arrays = _Reframe(frag_arrays)
View
@@ -14,85 +14,7 @@
import word_eval # module under test
class SplitTest(unittest.TestCase):
    """Unit tests for word_eval._IfsSplit(s, ifs)."""

    def testIfsSplitEmpty(self):
        self.assertEqual(
            [''], word_eval._IfsSplit('', ' \t\n'))
        self.assertEqual(
            ['', ''], word_eval._IfsSplit(' ', ' \t\n'))
        self.assertEqual(
            [''], word_eval._IfsSplit('', ' '))

        # No word splitting when no IFS.  Hm.
        self.assertEqual(
            [''], word_eval._IfsSplit('', ''))

    def testIfsSplit(self):
        self.assertEqual(
            ['', 'foo', 'bar', ''],
            word_eval._IfsSplit('\tfoo bar\n', ' \t\n'))
        self.assertEqual(
            ['\tfoo bar\n'],
            word_eval._IfsSplit('\tfoo bar\n', ''))
        self.assertEqual(
            ['a', '', 'd'],
            word_eval._IfsSplit('abcd', 'bc'))

    def testIfsSplit_Mixed(self):
        self.assertEqual(
            ['a', 'cd'],
            word_eval._IfsSplit('abcd', ' b'))

        # IFS whitespace rule
        self.assertEqual(
            ['a', 'c'],
            word_eval._IfsSplit('abc', 'b '))
        self.assertEqual(
            ['a', 'c'],
            word_eval._IfsSplit('a c', 'b '))
        self.assertEqual(
            ['a', '', 'c'],
            word_eval._IfsSplit('abbc', 'b '))
        self.assertEqual(
            ['', 'a', '', '', 'cd', ''],
            word_eval._IfsSplit('\ta b\tb cd\n', 'b \t\n'))
        self.assertEqual(
            ['', 'a', 'cd', ''],
            word_eval._IfsSplit('\tabcd\n', 'b \t\n'))

    # Marked as skipped instead of returning early, so the test runner
    # reports the case as disabled rather than as a pass.
    @unittest.skip("Doesn't work yet")
    def testIfsSplit_Mixed2(self):
        self.assertEqual(
            ['a', '', '', 'b'],
            word_eval._IfsSplit('a _ _ _ b', '_ '))

    def testIfsSplitWhitespaceOnly(self):
        # No non-whitespace IFS
        self.assertEqual(
            ['', 'a', 'c', ''],
            word_eval._IfsSplit(' a c ', ' '))
        self.assertEqual(
            ['', 'c'],
            word_eval._IfsSplit(' c', ' \t\n'))

    def testIfsSplitNonWhitespaceOnly(self):
        self.assertEqual(
            ['a', 'c'],
            word_eval._IfsSplit('a_c', '_'))
        self.assertEqual(
            ['', ''],
            word_eval._IfsSplit('_', '_'))
class WordEvalTest(unittest.TestCase):
    """Placeholder test case; it defines no test methods yet."""
# Run the unit tests above when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
View
@@ -181,6 +181,14 @@ argv "$escaped" "$raw"
# stdout: ['one twox65three', 'one\\ two\\x65three']
# BUG mksh/zsh stdout: ['one twoethree', 'one\\ twoethree']
### read with line continuation reads multiple physical lines
# Without -r, a backslash-newline pair is dropped and the next physical line
# is appended; with -r the backslash is kept and only the first line is read.
echo -e 'one\\\ntwo\n' > $TMP/readr.txt
read escaped < $TMP/readr.txt
read -r raw < $TMP/readr.txt
argv "$escaped" "$raw"
# stdout: ['onetwo', 'one\\']
# N-I dash stdout: ['-e onetwo', '-e one\\']
### read -r with \n
echo '\nline' > $TMP/readr.txt
read escaped < $TMP/readr.txt
@@ -191,3 +199,44 @@ argv "$escaped" "$raw"
# stdout: ['nline', '\\nline']
# BUG dash/mksh/zsh stdout: ['', '']
### Read with IFS=$'\n'
# Leading spaces are stripped only if space appears in IFS; it does not here,
# so the expected output keeps them.
# NOTE(review): $(...) strips trailing newlines, so this assignment may leave
# IFS empty rather than set to a newline -- confirm.
IFS=$(echo -e '\n')
read var <<EOF
a b c
d e f
EOF
echo "[$var]"
# stdout: [ a b c]
# N-I dash stdout: [a b c]
### Read with IFS=:
# Leading spaces are stripped only if space appears in IFS, so here they
# are kept.
# A backslash escapes an IFS char, and a trailing backslash continues the
# input onto the next physical line.
IFS=:
{ echo ' \\a :b\: c:d\';
echo ' e'
} > $TMP/read-ifs.txt
read a b c d < $TMP/read-ifs.txt
echo "[$a|$b|$c|$d]"
# stdout: [ a |b: c|d e|]
# BUG bash stdout: [ \a |b: c|d e|]
### Read with IFS=''
# With an empty IFS the line is not split, so x receives all of it and y
# is left empty.
IFS=''
read x y <<EOF
a b c d
EOF
echo "[$x|$y]"
# stdout: [ a b c d|]
### Read should not respect C escapes.
# bash doesn't respect these, but other shells do.  I think the bash
# behavior makes more sense.  It only escapes IFS.
# The expected output has every backslash dropped and the char after it kept
# as is.
echo '\a \b \c \d \e \f \g \h \x65 \145 \i' > $TMP/read-c.txt
read line < $TMP/read-c.txt
echo $line
# stdout-json: "a b c d e f g h x65 145 i\n"
# BUG dash/zsh stdout-json: "\u0007 \u0008\n"
# BUG mksh stdout-json: "\u0007 \u0008 d \u001b \u000c g h e 145 i\n"