View
@@ -1,6 +1,17 @@
#!/usr/bin/python
"""
legacy.py
Nice blog post on the complexity/corner cases/differing intuition of splitting
strings:
https://chriszetter.com/blog/2017/10/29/splitting-strings/
python-dev doesn't want to touch it anymore!
Other notes:
- How does this compare to awk -F?
- re.split() ? This appears not to work.
"""
import re
@@ -10,18 +21,19 @@
value_e = runtime.value_e
def GetIfs(mem):
  """Look up the shell's current IFS value, for splitting words in Splitter.

  An unset IFS behaves like the empty string; a string value is returned
  unchanged.  Any other type is an error.
  """
  val = mem.GetVar('IFS')
  if val.tag == value_e.Str:
    return val.s
  if val.tag == value_e.Undef:
    return ''
  # TODO: Raise proper error
  raise AssertionError("IFS shouldn't be an array")
DEFAULT_IFS = ' \t\n'
class CompletionSplitter:
  """Trivial splitter used during completion.

  Treats the entire input as one span that is never ignored.
  """

  def __init__(self):
    pass

  def SplitForWordEval(self, s):
    # A single non-ignored span covering the whole string.
    return (False, len(s))

  # NOTE: Doesn't need to implement SplitForRead
# NOTE: Doesn't need to implement SplitForRead
def _Split(s, ifs):
@@ -61,6 +73,7 @@ def IfsSplit(s, ifs):
return _Split(s, ifs)
# Detect IFS whitespace
# TODO: This should be cached. In Mem? Or Splitter?
ifs_whitespace = ''
ifs_other = ''
for c in ifs:
@@ -100,3 +113,334 @@ def IfsSplit(s, ifs):
frags = regex.split(s)
#log('split %r by %r -> frags %s', s, pat, frags)
return frags
# Split operation:
#
# Max to allocate: the length of the string? That's the worst case. Every
# character is a different split.
#
# or use end_index?
#
# word_eval: Makes runtime.fragment out of it. Only takes the parts that are
# not delimiters.
#
# read: assigns it to variables, except for the trailing ones. Don't need
# to split them.
# TODO:
# - Executor holds a splitter. Passes it to word_eval and to the read
# builtin.
#
# Do we have different splitters? Awk splitter might be useful. Regex
# splitter later. CSV splitter? TSV? the TSV one transforms? Because of
# \u0065 in JSON. I guess you can have another kind of slice -- a
# LiteralSlice.
#
#
# with SPLIT_REGEX = / digit+ / {
# echo $#
# echo $len(argv)
# echo $1 $2
# echo @argv
# }
#
# Yes this is nice. How does perl do it?
class RootSplitter(object):
  """A polymorphic interface to field splitting.

  It respects a STACK of IFS values, for example:

  echo $x           # uses default shell IFS
  IFS=':' myfunc    # new splitter
  echo $x           # uses default shell IFS again.
  """

  def __init__(self, mem):
    # mem: scope for looking up the current IFS value.
    self.mem = mem
    # Cache of instantiated splitters, keyed by the raw IFS string.
    # Each splitter internally splits IFS into (ifs_whitespace, ifs_other).
    self.splitters = {}  # IFS value -> splitter instance

  def _GetSplitter(self):
    """Based on the current stack frame, get the splitter."""
    val = self.mem.GetVar('IFS')
    if val.tag == value_e.Undef:
      ifs = ''
    elif val.tag == value_e.Str:
      ifs = val.s
    else:
      # TODO: Raise proper error
      raise AssertionError("IFS shouldn't be an array")

    try:
      sp = self.splitters[ifs]
    except KeyError:
      # Figure out what kind of splitter we should instantiate: partition IFS
      # into whitespace chars and everything else.
      ifs_whitespace = ''
      ifs_other = ''
      for c in ifs:
        if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
          ifs_whitespace += c
        else:
          ifs_other += c

      if ifs_other:
        sp = MixedSplitter(ifs_whitespace, ifs_other)
      else:
        sp = WhitespaceSplitter(ifs_whitespace)

      # NOTE: Technically, we could make the key more precise.  IFS=$' \t' is
      # the same as IFS=$'\t '.  But most programs probably don't do that, and
      # everything should work in any case.
      self.splitters[ifs] = sp

    return sp

  def ShouldElide(self):
    # HACK for now: empty words are elided only when the current IFS is
    # whitespace-only (i.e. we got a WhitespaceSplitter from the cache).
    sp = self._GetSplitter()
    return isinstance(sp, WhitespaceSplitter)

  def SplitForWordEval(self, s):
    """Split s and return only the parts whose spans are NOT ignored.

    A span can be IGNORED for two reasons:

    1. The span is a delimiter.
    2. The span is a backslash escape.

    Example: If you have one\\:two, then there are four spans.  Only the
    backslash one is ignored.  In 'one:two', then you have three spans.  The
    colon is ignored.

    Returns:
      A list of strings: the non-ignored slices of s, in order.
    """
    sp = self._GetSplitter()
    spans = sp.Split(s, False)
    parts = []
    start_index = 0
    for ignored, end_index in spans:
      if not ignored:
        parts.append(s[start_index:end_index])
      start_index = end_index
    return parts

  def SplitForRead(self, s, allow_escape):
    # NOTE(review): stub -- it always splits with DEFAULT_IFS whitespace,
    # computes spans but never uses them, and returns a placeholder list.
    # Does this give you back the exact number you need?
    # Removes ignored ones
    sp = WhitespaceSplitter(DEFAULT_IFS)
    spans = sp.Split(s, allow_escape)
    parts = ['TODO']
    return parts
# We detect state changes. WHITE is for whitespace, BLACK is for significant
# chars.
STATE_WHITE, STATE_BLACK = 0, 2


class WhitespaceSplitter(object):
  """Splits a string on runs of IFS whitespace (the common case)."""

  def __init__(self, ifs_whitespace):
    self.ifs_whitespace = ifs_whitespace

  def Split(self, s, allow_escape):
    """Return a list of (ignored Bool, end_index Int) spans covering s.

    A span is "ignored" when it consists of IFS whitespace.  allow_escape is
    accepted for interface compatibility but not used by this splitter.
    """
    if not s:
      return []

    ws = self.ifs_whitespace
    n = len(s)
    spans = []  # NOTE: in C, could reserve() this to len(s)

    # Emit a span every time the character class flips.
    prev = STATE_WHITE if s[0] in ws else STATE_BLACK
    for i in range(1, n):
      cur = STATE_WHITE if s[i] in ws else STATE_BLACK
      if cur != prev:
        spans.append((prev == STATE_WHITE, i))
        prev = cur

    # The final span runs to the end of the string.
    spans.append((prev == STATE_WHITE, n))
    return spans
# IFS splitting is complicated in general. We handle it with three concepts:
#
# - CH_* - Kinds of characters (edge labels)
# - ST_* - States (node labels)
# - Actions: EMIT, etc.
#
# The Split() loop below classifies characters, follows state transitions, and
# emits spans. A span is a (ignored Bool, end_index Int) pair.
# As an example, consider this string:
# 'a _ b'
#
# The character classes are:
#
# a ' ' _ ' ' b
# BLACK DE_WHITE DE_GRAY DE_WHITE BLACK
#
# The states are:
#
# a ' ' _ ' ' b
# BLACK DE_WHITE1 DE_GRAY DE_WHITE2 BLACK
#
# DE_WHITE2 is whitespace that follows a "gray" non-whitespace IFS character.
#
# The spans emitted are:
#
# (part 'a', ignored ' _ ', part 'b')
# SplitForRead() will check if the last two spans are a \ and \\n. Easy.
# Edges are characters. CH_DE_ is the delimiter prefix. WHITE is for
# whitespace; GRAY is for other IFS chars; BLACK is for significant
# characters.
CH_DE_WHITE, CH_DE_GRAY, CH_BLACK, CH_BACKSLASH = range(4)

# Nodes are states
ST_START, ST_DE_WHITE1, ST_DE_GRAY, ST_DE_WHITE2, ST_BLACK, ST_BACKSLASH = range(6)

# Actions control what spans to emit.
EMIT_PART, EMIT_DE, EMIT_EMPTY, EMIT_ESCAPE, NO_EMIT = range(5)

# (state, character class) -> (new state, action to perform)
TRANSITIONS = {
    (ST_START, CH_DE_WHITE): (ST_DE_WHITE1, NO_EMIT),      # ' '
    (ST_START, CH_DE_GRAY): (ST_DE_GRAY, EMIT_EMPTY),      # '_'
    (ST_START, CH_BLACK): (ST_BLACK, NO_EMIT),             # 'a'
    (ST_START, CH_BACKSLASH): (ST_BACKSLASH, NO_EMIT),     # '\'

    (ST_DE_WHITE1, CH_DE_WHITE): (ST_DE_WHITE1, NO_EMIT),  # ' '
    (ST_DE_WHITE1, CH_DE_GRAY): (ST_DE_GRAY, NO_EMIT),     # ' _'
    (ST_DE_WHITE1, CH_BLACK): (ST_BLACK, EMIT_DE),         # ' a'
    (ST_DE_WHITE1, CH_BACKSLASH): (ST_BACKSLASH, EMIT_DE), # ' \'

    (ST_DE_GRAY, CH_DE_WHITE): (ST_DE_WHITE2, NO_EMIT),    # '_ '
    (ST_DE_GRAY, CH_DE_GRAY): (ST_DE_GRAY, EMIT_EMPTY),    # '__'
    (ST_DE_GRAY, CH_BLACK): (ST_BLACK, EMIT_DE),           # '_a'
    # NOTE(review): every other CH_BACKSLASH row goes to ST_BACKSLASH; this
    # one goes to ST_BLACK.  Confirm whether that is intentional.
    (ST_DE_GRAY, CH_BACKSLASH): (ST_BLACK, EMIT_DE),       # '_\'

    (ST_DE_WHITE2, CH_DE_WHITE): (ST_DE_WHITE2, NO_EMIT),    # '_ '
    (ST_DE_WHITE2, CH_DE_GRAY): (ST_DE_GRAY, EMIT_EMPTY),    # '_ _'
    (ST_DE_WHITE2, CH_BLACK): (ST_BLACK, EMIT_DE),           # '_ a'
    (ST_DE_WHITE2, CH_BACKSLASH): (ST_BACKSLASH, EMIT_DE),   # '_ \'

    (ST_BLACK, CH_DE_WHITE): (ST_DE_WHITE1, EMIT_PART),    # 'a '
    (ST_BLACK, CH_DE_GRAY): (ST_DE_GRAY, EMIT_PART),       # 'a_'
    (ST_BLACK, CH_BLACK): (ST_BLACK, NO_EMIT),             # 'aa'
    (ST_BLACK, CH_BACKSLASH): (ST_BACKSLASH, EMIT_PART),   # 'a\'

    # Here we emit an ignored \ and the second character as well.
    # We're emitting TWO spans here; we don't wait until the subsequent
    # character.  That is OK.
    #
    # Problem: if '\ ' is the last one, we don't want to emit a trailing span?
    # In all other cases we do.
    (ST_BACKSLASH, CH_DE_WHITE): (ST_BLACK, EMIT_ESCAPE),      # '\ '
    (ST_BACKSLASH, CH_DE_GRAY): (ST_BLACK, EMIT_ESCAPE),       # '\_'
    (ST_BACKSLASH, CH_BLACK): (ST_BLACK, EMIT_ESCAPE),         # '\a'
    (ST_BACKSLASH, CH_BACKSLASH): (ST_BACKSLASH, EMIT_ESCAPE), # '\\'
}


class MixedSplitter(object):
  """Split a string when IFS has non-whitespace characters.

  Drives the TRANSITIONS state machine and returns a list of
  (ignored Bool, end_index Int) spans covering the whole string.
  """

  def __init__(self, ifs_whitespace, ifs_other):
    self.ifs_whitespace = ifs_whitespace
    self.ifs_other = ifs_other

  def Split(self, s, allow_escape):
    # NOTE(review): allow_escape is currently unused -- the classification
    # below never yields CH_BACKSLASH, so the ST_BACKSLASH rows of
    # TRANSITIONS are unreachable from this method.
    length = len(s)
    spans = []  # NOTE: in C, could reserve() this to len(s)
    if length == 0:
      return spans  # empty

    # Ad hoc rule from POSIX: "IFS white space shall be ignored at the
    # beginning and end of the input".  The leading part can't really be
    # handled by the state machine, so consume it up front.
    start = 0
    while start < length and s[start] in self.ifs_whitespace:
      start += 1
    if start:
      spans.append((True, start))  # one ignored span for the leading run

    # String is ONLY whitespace; don't emit another trailing span.
    if start == length:
      return spans

    state = ST_START
    for pos in range(start, length):
      c = s[pos]
      if c in self.ifs_whitespace:
        edge = CH_DE_WHITE
      elif c in self.ifs_other:
        edge = CH_DE_GRAY
      else:
        edge = CH_BLACK

      state, action = TRANSITIONS[state, edge]

      if action == EMIT_PART:
        spans.append((False, pos))
      elif action == EMIT_DE:
        spans.append((True, pos))  # ignored delimiter
      elif action == EMIT_EMPTY:
        spans.append((True, pos))   # ignored delimiter
        spans.append((False, pos))  # EMPTY part that is NOT ignored
      else:
        pass  # NO_EMIT: emit nothing

    # Trailing span: ignored iff we ended inside a delimiter run.
    in_delim = state in (ST_DE_WHITE1, ST_DE_GRAY, ST_DE_WHITE2)
    spans.append((in_delim, length))
    return spans
# self.splitter = RootSplitter()
# SplitManager
# Has the cache from IFS -> splitter
# Split(s, allow_escape)
#
# _DefaultIfsSplitter -- \t\n\n
# _WhitespaceIfsSplitter
# _OtherIfsSplitter
# _MixedIfsSplitter -- ifs and other
# Split(s, allow_escape)
#
# RegexSplitter
# CsvSplitter (TSV2Splitter maybe)
# AwkSplitter
#
# Any other kind of tokenizing? This is based on lines. So TSV2 does fit in.
View
@@ -8,8 +8,100 @@
from core import legacy # module under test
def _RunSplitCases(test, sp, cases):
for expected_parts, s, allow_escape in cases:
spans = sp.Split(s, allow_escape)
print('%r: %s' % (s, spans))
parts = []
start_index = 0
for ignored, end_index in spans:
if not ignored:
parts.append(s[start_index:end_index])
start_index = end_index
test.assertEqual(expected_parts, parts,
'%r: %s != %s' % (s, expected_parts, parts))
class SplitTest(unittest.TestCase):
  """Tests for the span-based splitters (WhitespaceSplitter, MixedSplitter)."""

  def testDefaultIfs(self):
    # Whitespace-only IFS: leading/trailing whitespace yields no empty parts.
    CASES = [
        ([], '', True),
        (['a'], 'a', True),
        (['a'], ' a ', True),
        (['ab'], '\tab\n', True),
        (['a', 'b'], 'a b\n', True),
    ]

    sp = legacy.WhitespaceSplitter(legacy.DEFAULT_IFS)
    _RunSplitCases(self, sp, CASES)

  def testMixedIfs(self):
    # Non-whitespace delimiters produce empty fields; surrounding IFS
    # whitespace collapses into the same split.
    CASES = [
        ([], '', True),
        (['a', 'b'], 'a_b', True),
        (['a', 'b'], ' a b ', True),
        (['a', 'b'], 'a _ b', True),
        (['a', 'b'], ' a _ b ', True),
        (['a', '', 'b'], 'a _ _ b', True),
        (['a', '', 'b'], 'a __ b', True),
        (['a', '', '', 'b'], 'a _ _ _ b', True),

        (['a'], ' a _ ', True),
        # Contrast with the case above.
        # NOTES:
        # - This cases REQUIRES ignoring leading whitespace. The state machine
        # can't handle it.
        # - We get three spans with index 1 because of the initial rule to
        # ignore whitespace, and then EMIT_EMPTY. Seems harmless for now?
        (['', 'a'], ' _ a _ ', True),
    ]

    # IFS='_ '
    sp = legacy.MixedSplitter(' ', '_')
    _RunSplitCases(self, sp, CASES)

  def testWhitespaceOnly(self):
    CASES = [
        ([], '', True),
        ([], '\t', True),
        (['a'], 'a\t', True),
        (['a', 'b'], '\t\ta\tb\t', True),
    ]

    # IFS=$'\t' -- whitespace only, but via MixedSplitter
    sp = legacy.MixedSplitter('\t', '')
    _RunSplitCases(self, sp, CASES)

  def testOtherOnly(self):
    CASES = [
        ([], '', True),
        ([''], '_', True),
        (['a'], 'a_', True),
        (['', '', 'a', 'b'], '__a_b_', True),
    ]

    # IFS='_' -- no whitespace chars at all
    sp = legacy.MixedSplitter('', '_')
    _RunSplitCases(self, sp, CASES)

  def testTwoOther(self):
    CASES = [
        (['a', '', 'b', '', '', 'c', 'd'], 'a__b---c_d', True)
    ]

    # IFS='_-' -- two non-whitespace delimiters
    sp = legacy.MixedSplitter('', '_-')
    _RunSplitCases(self, sp, CASES)
class OldSplitTest(unittest.TestCase):
  def testIfsSplitEmpty(self):
    # Old IfsSplit semantics: the empty string yields one empty field.
    self.assertEqual(
        [''], legacy.IfsSplit('', ' \t\n'))
@@ -35,58 +127,6 @@ def testIfsSplit(self):
['a', '', 'd'],
legacy.IfsSplit('abcd', 'bc'))
  def testIfsSplit_Mixed(self):
    # IFS with both whitespace and non-whitespace characters.
    self.assertEqual(
        ['a', 'cd'],
        legacy.IfsSplit('abcd', ' b'))

    # IFS whitespace rule: whitespace adjacent to a non-whitespace delimiter
    # is folded into the same split.
    self.assertEqual(
        ['a', 'c'],
        legacy.IfsSplit('abc', 'b '))

    self.assertEqual(
        ['a', 'c'],
        legacy.IfsSplit('a c', 'b '))

    # Two adjacent non-whitespace delimiters produce an empty field.
    self.assertEqual(
        ['a', '', 'c'],
        legacy.IfsSplit('abbc', 'b '))

    self.assertEqual(
        ['', 'a', '', '', 'cd', ''],
        legacy.IfsSplit('\ta b\tb cd\n', 'b \t\n'))

    self.assertEqual(
        ['', 'a', 'cd', ''],
        legacy.IfsSplit('\tabcd\n', 'b \t\n'))
  def testIfsSplit_Mixed2(self):
    # Doesn't work yet: the early return below deliberately disables this
    # test.  NOTE(review): self.skipTest() would make the skip visible in
    # the test report.
    return
    self.assertEqual(
        ['a', '', '', 'b'],
        legacy.IfsSplit('a _ _ _ b', '_ '))
  def testIfsSplitWhitespaceOnly(self):
    # No non-whitespace IFS chars.  Note: old IfsSplit keeps the leading and
    # trailing empty fields here, unlike the span-based splitters above.
    self.assertEqual(
        ['', 'a', 'c', ''],
        legacy.IfsSplit(' a c ', ' '))
    self.assertEqual(
        ['', 'c'],
        legacy.IfsSplit(' c', ' \t\n'))
  def testIfsSplitNonWhitespaceOnly(self):
    # Only non-whitespace IFS chars: every occurrence is a field boundary.
    self.assertEqual(
        ['a', 'c'],
        legacy.IfsSplit('a_c', '_'))
    # A string that is just the delimiter yields two empty fields.
    self.assertEqual(
        ['', ''],
        legacy.IfsSplit('_', '_'))
if __name__ == '__main__':
unittest.main()
View
@@ -14,6 +14,7 @@
import os
from core import args
from core import legacy
from core import runtime
from core import util
from core.id_kind import Id
@@ -346,7 +347,7 @@ def _InitDefaults(self):
# Default value; user may unset it.
# $ echo -n "$IFS" | python -c 'import sys;print repr(sys.stdin.read())'
# ' \t\n'
SetGlobalString(self, 'IFS', ' \t\n')
SetGlobalString(self, 'IFS', legacy.DEFAULT_IFS)
SetGlobalString(self, 'PWD', os.getcwd())
# NOTE: Should we put these in a namespace for Oil?
View
@@ -43,6 +43,7 @@ def _ValueToPartValue(val, quoted):
raise AssertionError
# TODO: Move to RootSplitter?
def _GetJoinChar(mem):
"""
For decaying arrays by joining, eg. "$@" -> $@.
@@ -66,30 +67,58 @@ def _GetJoinChar(mem):
raise AssertionError("IFS shouldn't be an array")
def _SplitPartsIntoFragments(part_vals, splitter):
  """Split each part value into fragment arrays using the current splitter.

  This view of the file contained merge/diff residue: a duplicate definition
  with the old (part_vals, ifs) signature, leftover lines referencing the
  undefined name `ifs`, and dead assignments to `res` next to the live
  `frags` assignments.  This is the single, cleaned-up new-signature version.

  Args:
    part_vals: array of runtime.part_value
    splitter: RootSplitter() instance

  Returns:
    An array of arrays of runtime.fragment().  Every part contributes one
    fragment array; a part that splits into nothing is dropped entirely.

  NOTE: Tricky case to keep in mind (a CompoundWord that must be reframed):
    argv.py 1${undefined:-"2 3" "4 5"}6
  """
  frag_arrays = []
  for p in part_vals:
    if p.tag == part_value_e.StringPartValue:
      if p.do_split_elide:
        split_strs = splitter.SplitForWordEval(p.s)
        # A part that is entirely IFS characters splits into zero strings
        # and contributes no fragment array at all.
        if not split_strs:
          continue
        frags = [runtime.fragment(f, True, p.do_glob) for f in split_strs]
      else:
        # Example: 'a b' and "a b" don't need to be split.
        frags = [runtime.fragment(p.s, p.do_split_elide, p.do_glob)]
    elif p.tag == part_value_e.ArrayPartValue:
      # "$@" and "${a[@]}" don't need to be split or globbed.
      frags = [runtime.fragment(f, False, False) for f in p.strs]
    else:
      raise AssertionError(p.tag)

    frag_arrays.append(frags)

  return frag_arrays
@@ -120,6 +149,9 @@ def _Reframe(frag_arrays):
def _JoinElideEscape(frag_arrays, elide_empty, glob_escape):
"""Join parts without globbing or eliding.
Args:
frag_arrays: Array of array of runtime.fragment
Returns:
arg_value[]
"""
@@ -859,11 +891,12 @@ class _WordEvaluator:
EvalWordSequence
Error
"""
def __init__(self, mem, exec_opts, part_ev):
def __init__(self, mem, exec_opts, part_ev, splitter):
self.mem = mem
self.exec_opts = exec_opts
self.part_ev = part_ev
self.splitter = splitter
self.globber = glob_.Globber(exec_opts)
def _EvalParts(self, word, quoted=False):
@@ -983,20 +1016,16 @@ def _EvalWordAndReframe(self, word):
"""
part_vals = self._EvalParts(word)
#log('part_vals after _EvalParts %s', part_vals)
ifs = legacy.GetIfs(self.mem)
frag_arrays = _SplitPartsIntoFragments(part_vals, ifs)
#ifs = legacy.GetIfs(self.mem)
frag_arrays = _SplitPartsIntoFragments(part_vals, self.splitter)
#log('Fragments after split: %s', frag_arrays)
frag_arrays = _Reframe(frag_arrays)
#log('Fragments after reframe: %s', frag_arrays)
glob_escape = not self.exec_opts.noglob
# NOTE: Empirically, elision depends on IFS. I don't see it in the POSIX
# spec though. This may need to be revised to have ' \t\n'.
elide_empty = True
for c in ifs:
if c not in ' \t\n':
elide_empty = False
# Empirically, elision depends on IFS. I don't see it in the POSIX spec?
elide_empty = self.splitter.ShouldElide()
args = _JoinElideEscape(frag_arrays, elide_empty, glob_escape)
#log('After _JoinElideEscape %s', args)
@@ -1075,9 +1104,9 @@ def _EvalProcessSub(self, node, id_):
class NormalWordEvaluator(_WordEvaluator):
  # NOTE(review): the diff view showed a leftover old __init__ signature
  # (without `splitter`) above the new one; only the new version is kept.

  def __init__(self, mem, exec_opts, splitter, ex):
    part_ev = _NormalPartEvaluator(mem, exec_opts, ex, self)
    _WordEvaluator.__init__(self, mem, exec_opts, part_ev, splitter)
class _CompletionPartEvaluator(_WordPartEvaluator):
@@ -1105,7 +1134,11 @@ def _EvalProcessSub(self, node, id_):
class CompletionWordEvaluator(_WordEvaluator):
  """
  Difference from NormalWordEvaluator: No access to executor!  But they both
  have a splitter.
  """
  # NOTE(review): the diff view showed leftover old __init__/super-call lines
  # (without `splitter`); only the new versions are kept.

  def __init__(self, mem, exec_opts, splitter):
    part_ev = _CompletionPartEvaluator(mem, exec_opts, self)
    _WordEvaluator.__init__(self, mem, exec_opts, part_ev, splitter)
View
@@ -78,6 +78,18 @@ s1='_a_b_'
argv.py $s1
# stdout: ['', 'a', 'b']
### Leading ' ' vs leading ' _ '
# This behavior is weird, but all shells agree.
IFS='_ '
s1='_ a b _ '
s2=' a b _ '
argv.py $s1
argv.py $s2
## STDOUT:
['', 'a', 'b']
['a', 'b']
## END
### Multiple non-whitespace IFS chars.
IFS=_-
s1='a__b---c_d'