/
split.py
334 lines (266 loc) · 9.32 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
"""
split.py - Word Splitting
Nice blog post on the complexity/corner cases/differing intuition of splitting
strings:
https://chriszetter.com/blog/2017/10/29/splitting-strings/
python-dev doesn't want to touch it anymore!
Other possible splitters:
- AwkSplitter -- how does this compare to awk -F?
- RegexSplitter
- CsvSplitter
- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
a pure slice, but neither is IFS splitting because of backslashes.
- Perl?
- does perl have a split context?
with SPLIT_REGEX = / digit+ / {
echo $#
echo $len(argv)
echo $1 $2
echo @argv
}
"""
from _devbuild.gen.runtime_asdl import value_e, span_e, value__Str
# Shorter names for state machine enums
from _devbuild.gen.runtime_asdl import emit_e as EMIT
from _devbuild.gen.runtime_asdl import char_kind_e as CH
from _devbuild.gen.runtime_asdl import state_e as ST
from core import util
from core.util import log
from frontend import consts
from mycpp import mylib
from mycpp.mylib import tagswitch
from typing import List, Tuple, Dict, TYPE_CHECKING, cast
if TYPE_CHECKING:
from core.state import Mem
from _devbuild.gen.runtime_asdl import span_t, value_t
Span = Tuple[span_t, int]
# IFS value that SplitContext falls back to when the IFS variable is
# undefined: space, tab, newline.
DEFAULT_IFS = ' \t\n'
def _SpansToParts(s, spans):
  # type: (str, List[Span]) -> List[str]
  """Assemble the fields of `s` from its spans; helper for SplitForWordEval.

  Each span is a (span type, end index) pair.  A span covers the text from the
  end of the previous span (index 0 for the first one) up to its own end
  index.  Only Black spans contribute text to the result.

  After a Backslash span that directly follows a Black span, the next Black
  span is appended to the current field instead of starting a new one.

  Args:
    s: the string the spans refer to
    spans: (span type, end index) pairs, in order

  Returns:
    The text of each field, in order.
  """
  fields = []  # type: List[mylib.BufWriter]
  begin = 0

  # glue_next: the next Black span extends the last field.
  # prev_black: the span handled in the previous iteration was Black.
  glue_next = False
  prev_black = False

  for kind, end in spans:
    if kind == span_e.Backslash:
      if prev_black:
        glue_next = True
      prev_black = False

    elif kind == span_e.Black:
      piece = s[begin:end]
      if glue_next and len(fields) > 0:
        fields[-1].write(piece)
        glue_next = False
      else:
        writer = mylib.BufWriter()
        writer.write(piece)
        fields.append(writer)
      prev_black = True

    else:
      # Any other span type separates fields and contributes no text.
      prev_black = False

    begin = end

  out = []  # type: List[str]
  for field in fields:
    out.append(field.getvalue())
  return out
class SplitContext(object):
  """Entry point for field splitting, driven by the current value of IFS.

  The IFS value is read from `mem` on every call rather than fixed at
  construction time, which is how a STACK of IFS values is respected:

    echo $x  # uses default shell IFS
    IFS=':' myfunc  # new splitter
    echo $x  # uses default shell IFS again.
  """

  def __init__(self, mem):
    # type: (Mem) -> None
    self.mem = mem

    # Cache of splitter instances, keyed by the exact IFS string each one was
    # built for.
    self.splitters = {}  # type: Dict[str, IfsSplitter]

  def _GetSplitter(self):
    # type: () -> IfsSplitter
    """Return the splitter matching the IFS value currently visible in `mem`.

    An undefined IFS is treated as DEFAULT_IFS.  Splitters are cached per IFS
    string, so each distinct value is analyzed only once.

    Raises:
      AssertionError: if IFS is neither undefined nor a string.
    """
    val = self.mem.GetVar('IFS')

    UP_val = val
    with tagswitch(val) as case:
      if case(value_e.Undef):
        ifs = DEFAULT_IFS
      elif case(value_e.Str):
        val = cast(value__Str, UP_val)
        ifs = val.s
      else:
        # TODO: Raise proper error
        raise AssertionError("IFS shouldn't be an array")

    cached = self.splitters.get(ifs)
    if cached is not None:
      return cached

    # Not seen before: partition the IFS characters into whitespace and
    # everything else, since the splitter treats the two classes differently.
    white = mylib.BufWriter()
    other = mylib.BufWriter()
    for ch in ifs:
      if ch not in ' \t\n':  # Happens to be the same set as DEFAULT_IFS
        other.write(ch)
      else:
        white.write(ch)

    splitter = IfsSplitter(white.getvalue(), other.getvalue())

    # NOTE: Technically, we could make the key more precise.  IFS=$' \t' is
    # the same as IFS=$'\t '.  But most programs probably don't do that, and
    # everything should work in any case.
    self.splitters[ifs] = splitter
    return splitter

  def GetJoinChar(self):
    # type: () -> str
    """Return the string used to join array elements into a single string.

    This is the first character of IFS, a single space if IFS is undefined, or
    the empty string if IFS is set to the empty string.

    Raises:
      AssertionError: if IFS is neither undefined nor a string.
    """
    # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
    # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
    # "When the expansion occurs within a double-quoted string (see
    # Double-Quotes), it shall expand to a single field with the value of
    # each parameter separated by the first character of the IFS variable, or
    # by a <space> if IFS is unset. If IFS is set to a null string, this is
    # not equivalent to unsetting it; its first character does not exist, so
    # the parameter values are concatenated."
    val = self.mem.GetVar('IFS')  # type: value_t
    UP_val = val
    with tagswitch(val) as case:
      if case(value_e.Undef):
        return ' '
      elif case(value_e.Str):
        val = cast(value__Str, UP_val)
        if len(val.s) == 0:
          return ''
        return val.s[0]
      else:
        # TODO: Raise proper error
        raise AssertionError("IFS shouldn't be an array")

  def Escape(self, s):
    # type: (str) -> str
    """Escape IFS chars in `s`, using the splitter for the current IFS."""
    return self._GetSplitter().Escape(s)

  def SplitForWordEval(self, s):
    # type: (str) -> List[str]
    """Split `s` into fields according to the current IFS.

    The splitter first describes `s` as a list of spans, with backslash
    escapes enabled.  _SpansToParts then keeps only the text of the spans that
    are neither delimiters nor backslashes.

    Args:
      s: the string to split

    Returns:
      The fields of `s`, as a list of strings.
    """
    spans = self._GetSplitter().Split(s, True)
    if 0:
      for span in spans:
        log('SPAN %s', span)
    return _SpansToParts(s, spans)

  def SplitForRead(self, line, allow_escape):
    # type: (str, bool) -> List[Span]
    """Return the raw spans of `line` under the current IFS.

    Args:
      line: the string to split
      allow_escape: passed through to the splitter's Split()

    Returns:
      The span list produced by the splitter's Split().
    """
    return self._GetSplitter().Split(line, allow_escape)
class _BaseSplitter(object):
def __init__(self, escape_chars):
# type: (str) -> None
self.escape_chars = escape_chars + '\\' # Backslash is always escaped
def Escape(self, s):
# type: (str) -> str
# Note the characters here are DYNAMIC, unlike other usages of
# BackslashEscape().
return util.BackslashEscape(s, self.escape_chars)
# TODO: Use this when IFS='' or IFS isn't set?  This is the fast path for Oil!
class NullSplitter(_BaseSplitter):
  """Placeholder splitter: it can be constructed, but Split() is a stub."""

  def __init__(self, ifs_whitespace):
    # type: (str) -> None
    # The IFS whitespace characters double as the characters to escape.
    _BaseSplitter.__init__(self, ifs_whitespace)
    self.ifs_whitespace = ifs_whitespace

  def Split(self, s, allow_escape):
    # type: (str, bool) -> List[str]
    """Not implemented; always raises NotImplementedError.

    NOTE(review): the declared return type is List[str], while
    IfsSplitter.Split is declared to return List[Span] -- confirm which one is
    intended before implementing this.
    """
    raise NotImplementedError()
class IfsSplitter(_BaseSplitter):
  """Splits a string into spans based on two classes of IFS characters.

  IFS whitespace characters and the other IFS characters are kept separately,
  because Split() feeds them to the state machine as different character
  kinds.
  """

  def __init__(self, ifs_whitespace, ifs_other):
    # type: (str, str) -> None
    # Every IFS character, of either class, is subject to escaping.
    _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
    self.ifs_whitespace = ifs_whitespace
    self.ifs_other = ifs_other

  def Split(self, s, allow_escape):
    # type: (str, bool) -> List[Span]
    """Describe `s` as a list of spans by running the IFS state machine.

    Args:
      s: string to split
      allow_escape: False for read -r, this means backslash doesn't do
        anything; it is then classified like any ordinary character.

    Returns:
      List of (runtime.span, end_index) pairs.  The list is empty for an empty
      string, and is a single Delim span when `s` consists only of IFS
      whitespace.

    Raises:
      AssertionError: if the transition table yields the Invalid state or an
        unknown action.

    TODO: This should be (frag, do_split) pairs, to avoid IFS='\\'
    double-escaping issue.
    """
    ws_chars = self.ifs_whitespace
    other_chars = self.ifs_other

    n = len(s)
    spans = []  # type: List[Span]  # NOTE: in C, could reserve() this to len(s)

    if n == 0:
      return spans  # empty

    # Ad hoc rule from POSIX: ignore leading whitespace.
    # "IFS white space shall be ignored at the beginning and end of the input"
    # This can't really be handled by the state machine.

    i = 0
    while i < n and s[i] in self.ifs_whitespace:
      i += 1

    # Append an ignored span covering the leading whitespace, if there was any.
    if i != 0:
      spans.append((span_e.Delim, i))

    # String is ONLY whitespace.  We want to skip the last span after the
    # while loop.
    if i == n:
      return spans

    state = ST.Start
    while i < n:
      # Classify the current character.  IFS membership is tested before the
      # backslash check, so a backslash that is itself in IFS is a delimiter.
      c = s[i]
      if c in ws_chars:
        ch = CH.DE_White
      elif c in other_chars:
        ch = CH.DE_Gray
      elif allow_escape and c == '\\':
        ch = CH.Backslash
      else:
        ch = CH.Black

      # The table maps (current state, character kind) to (next state, action).
      new_state, action = consts.TRANSITIONS[state, ch]
      if new_state == ST.Invalid:
        raise AssertionError(
            'Invalid transition from %r with %r' % (state, ch))

      # Debug tracing, disabled.
      if 0:
        log('i %d c %r ch %s current: %s next: %s %s',
            i, c, ch, state, new_state, action)

      # Every span emitted here ends at index i, i.e. just before the
      # character that triggered the action.
      if action == EMIT.Part:
        spans.append((span_e.Black, i))
      elif action == EMIT.Delim:
        spans.append((span_e.Delim, i))  # ignored delimiter
      elif action == EMIT.Empty:
        spans.append((span_e.Delim, i))  # ignored delimiter
        spans.append((span_e.Black, i))  # EMPTY part that is NOT ignored
      elif action == EMIT.Escape:
        spans.append((span_e.Backslash, i))  # the backslash itself
      elif action == EMIT.Nothing:
        pass
      else:
        raise AssertionError()

      state = new_state
      i += 1

    # Input is exhausted: the final state decides whether the text still
    # pending becomes one last span ending at n.
    last_action = consts.LAST_SPAN_ACTION[state]
    #log('n %d state %s last_action %s', n, state, last_action)

    if last_action == EMIT.Part:
      spans.append((span_e.Black, n))
    elif last_action == EMIT.Delim:
      spans.append((span_e.Delim, n))
    elif last_action == EMIT.Escape:
      spans.append((span_e.Backslash, n))
    elif last_action == EMIT.Nothing:
      pass
    else:
      raise AssertionError()

    return spans