-
-
Notifications
You must be signed in to change notification settings - Fork 147
/
word_parse.py
1272 lines (1019 loc) · 41.9 KB
/
word_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.
Hairy example:
hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.
lex_mode_e.ShCommand (_ReadLeftParts)
All subs and quotes are allowed:
$v ${v} $() `` $(()) '' "" $'' $"" <() >()
lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
Var, Command, Arith, but no quotes.
$v ${v} $() `` $(())
No process substitution.
lex_mode_e.Arith
Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
need those for associative array indexing.
lex_mode_e.VSub_ArgUnquoted
Like UNQUOTED, everything is allowed (even process substitutions), but we
stop at }, and space is SIGNIFICANT.
Example: ${a:- b }
${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
lex_mode_e.VSub_ArgDQ
In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
"${x:-"default"}".
In contrast, VS_ARG_UNQ respects single quotes and process substitution.
It's weird that double quotes are allowed. Space is also significant here,
e.g. "${x:-a "b"}".
"""
from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Kind, Id_t
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.syntax_asdl import (
token, arith_expr_t, bracket_op_t,
suffix_op_t, suffix_op__Slice, suffix_op__PatSub,
word_t, word__CompoundWord, word__TokenWord,
word_part, word_part_t,
word_part__ArrayLiteralPart, word_part__LiteralPart,
word_part__BracedVarSub, word_part__SingleQuotedPart,
word_part__ArithSubPart, word_part__DoubleQuotedPart,
word_part__CommandSubPart, word_part__ExtGlobPart,
command, command_t, command__ForExpr,
suffix_op, bracket_op,
source,
)
# TODO: rename word -> osh_word in syntax.asdl
from _devbuild.gen.syntax_asdl import word as osh_word
from core import meta
from core.util import p_die
from frontend import reader
from frontend import tdop
from osh import arith_parse
from osh import braces
from osh import word
from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from frontend.lexer import Lexer
from frontend.parse_lib import ParseContext
from frontend.reader import _Reader
class WordParser(object):
def __init__(self, parse_ctx, lexer, line_reader, lex_mode=lex_mode_e.ShCommand):
  # type: (ParseContext, Lexer, _Reader, lex_mode_t) -> None
  """
  Args:
    parse_ctx: ParseContext that owns the arena and sub-parser factories.
    lexer: shared Lexer instance.
    line_reader: source of input lines (shared with other parsers).
    lex_mode: initial lexer mode.
  """
  self.parse_ctx = parse_ctx
  self.lexer = lexer
  self.line_reader = line_reader
  self.Reset(lex_mode=lex_mode)  # initializes all mutable parser state
def Reset(self, lex_mode=lex_mode_e.ShCommand):
  # type: (lex_mode_t) -> None
  """Re-initialize all mutable parser state; called by the interactive loop."""
  # State consumed by _Peek(): the lex mode to read the next token in, and
  # the current token with its cached type/kind.
  self.next_lex_mode = lex_mode
  self.cur_token = None  # type: token
  self.token_type = Id.Undefined_Tok
  self.token_kind = Kind.Undefined
  # Newline handling.  TODO: I think we can do this iteratively, without
  # member state.
  self.cursor_was_newline = False
  self.cursor = None  # type: word_t
  self.buffered_word = None  # type: word_t
def _Peek(self):
  # type: () -> token
  """Return the current token, reading a new one if _Next() was called.

  Reading happens lazily here (rather than in _Next) so the interactive
  parser doesn't consume input before it's needed.
  """
  if self.next_lex_mode is not None:
    self.cur_token = self.lexer.Read(self.next_lex_mode)
    # Cache the token's type and kind so callers can branch on them.
    self.token_type = self.cur_token.id
    self.token_kind = meta.LookupKind(self.token_type)
    self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
    self.next_lex_mode = None  # mark the token as consumed
  return self.cur_token
def _Next(self, lex_mode):
  # type: (lex_mode_t) -> None
  """Set the next lex state, but don't actually read a token.

  We need this for proper interactive parsing: the token is only read when
  _Peek() is called.
  """
  self.next_lex_mode = lex_mode
def _ReadVarOpArg(self, arg_lex_mode, eof_type=Id.Undefined_Tok,
                  empty_ok=True):
  # type: (lex_mode_t, Id_t, bool) -> word_t
  """Read the word argument of a ${} operator, e.g. 'default' in ${x:-default}.

  Args:
    arg_lex_mode: lex mode for the argument (quoted vs. unquoted variant)
    eof_type: token Id that ends the word (e.g. Id.Lit_Slash for pat sub)
    empty_ok: Whether EmptyWord can be returned
  """
  # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
  # valid, even when unquoted.
  self._Next(arg_lex_mode)
  self._Peek()
  w = self._ReadCompoundWord(lex_mode=arg_lex_mode, eof_type=eof_type,
                             empty_ok=empty_ok)
  # If the CompoundWord has no parts, and we're in a double-quoted VarSub
  # arg, and empty_ok, then return EmptyWord.  This is so it can evaluate to
  # the empty string and not get elided.
  #
  # Examples:
  # - "${s:-}", "${s/%pat/}"
  # It's similar to LooksLikeAssignment where we turn x= into x=''.  And it
  # has the same potential problem of not having spids.
  #
  # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
  # return a CompoundWord with no parts, which is explicitly checked with a
  # custom error message.
  if not w.parts and arg_lex_mode == lex_mode_e.VSub_ArgDQ and empty_ok:
    return osh_word.EmptyWord()
  return w
def _ReadSliceVarOp(self):
  # type: () -> suffix_op__Slice
  """Read a slice operator, e.g. the ':1:2' of ${a:1:2}.

  Grammar:  VarOf ':' ArithExpr (':' ArithExpr )?
  """
  self._Next(lex_mode_e.Arith)
  self._Peek()
  if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
    begin = None  # no beginning specified
  else:
    begin = self._ReadArithExpr()
  if self.token_type == Id.Arith_RBrace:
    return suffix_op.Slice(begin, None)  # No length specified
  # Id.Arith_Colon is a pun for Id.VOp2_Colon
  if self.token_type == Id.Arith_Colon:
    self._Next(lex_mode_e.Arith)
    length = self._ReadArithExpr()
    return suffix_op.Slice(begin, length)
  p_die("Unexpected token in slice: %r", self.cur_token.val,
        token=self.cur_token)
def _ReadPatSubVarOp(self, lex_mode):
  # type: (lex_mode_t) -> suffix_op__PatSub
  """Read a pattern-substitution operator, e.g. '/pat/replace' in ${x/pat/replace}.

  Grammar:
    Match  = ('/' | '#' | '%') WORD
    VarSub = ...
           | VarOf '/' Match '/' WORD
  """
  # empty_ok=False: the PATTERN must be non-empty (checked below with a
  # custom error message).
  pat = self._ReadVarOpArg(lex_mode, eof_type=Id.Lit_Slash, empty_ok=False)
  assert isinstance(pat, word__CompoundWord)  # Because empty_ok=False
  if len(pat.parts) == 1:
    ok, s, quoted = word.StaticEval(pat)
    if ok and s == '/' and not quoted:  # Looks like ${a////c}, read again
      self._Next(lex_mode)
      self._Peek()
      p = word_part.LiteralPart(self.cur_token)
      pat.parts.append(p)
  # Idiomatic emptiness test (was `len(pat.parts) == 0`).
  if not pat.parts:
    p_die('Pattern in ${x/pat/replace} must not be empty',
          token=self.cur_token)
  replace_mode = Id.Undefined_Tok
  # Check for / # % modifier on pattern.
  first_part = pat.parts[0]
  if isinstance(first_part, word_part__LiteralPart):
    lit_id = first_part.token.id
    if lit_id in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
      pat.parts.pop(0)
      replace_mode = lit_id
  # NOTE: If there is a modifier, the pattern can be empty, e.g.
  # ${s/#/foo} and ${a/%/foo}.
  if self.token_type == Id.Right_DollarBrace:
    # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
    return suffix_op.PatSub(pat, None, replace_mode)
  if self.token_type == Id.Lit_Slash:
    replace = self._ReadVarOpArg(lex_mode)  # do not stop at /
    self._Peek()
    if self.token_type != Id.Right_DollarBrace:
      # NOTE: I think this never happens.
      # We're either in the VS_ARG_UNQ or VS_ARG_DQ lex state, and everything
      # there is Lit_ or Left_, except for }.
      # Fixed for consistency with sibling errors: %r with the token's value,
      # not %s with the token object.
      p_die("Expected } after replacement string, got %r", self.cur_token.val,
            token=self.cur_token)
    return suffix_op.PatSub(pat, replace, replace_mode)
  # Happens with ${x//} and ${x///foo}, see test/parse-errors.sh
  p_die("Expected } after pat sub, got %r", self.cur_token.val,
        token=self.cur_token)
def _ReadSubscript(self):
  # type: () -> bracket_op_t
  """ Subscript = '[' ('@' | '*' | ArithExpr) ']'
  """
  # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
  # expression.
  t2 = self.lexer.LookAhead(lex_mode_e.Arith)
  if t2.id in (Id.Lit_At, Id.Arith_Star):
    op = bracket_op.WholeArray(t2.id)  # type: bracket_op_t
    self._Next(lex_mode_e.Arith)  # skip past [
    self._Peek()
    self._Next(lex_mode_e.Arith)  # skip past @
    self._Peek()
  else:
    self._Next(lex_mode_e.Arith)  # skip past [
    anode = self._ReadArithExpr()
    op = bracket_op.ArrayIndex(anode)
  if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
    p_die('Expected ] after subscript, got %r', self.cur_token.val,
          token=self.cur_token)
  self._Next(lex_mode_e.VSub_2)  # skip past ]
  self._Peek()  # Needed to be in the same spot as no subscript
  return op
def _ParseVarOf(self):
  # type: () -> word_part__BracedVarSub
  """Parse the variable reference inside ${}.

  VarOf = NAME Subscript?
        | NUMBER      # no subscript allowed, none of these are arrays
                      # ${@[1]} doesn't work, even though slicing does
        | VarSymbol
  """
  self._Peek()
  name_token = self.cur_token
  self._Next(lex_mode_e.VSub_2)
  self._Peek()  # Check for []
  if self.token_type == Id.VOp2_LBracket:
    # Local renamed from 'bracket_op' to avoid shadowing the bracket_op
    # module imported from syntax_asdl.
    b_op = self._ReadSubscript()
  else:
    b_op = None
  part = word_part.BracedVarSub(name_token)
  part.bracket_op = b_op
  return part
def _ParseVarExpr(self, arg_lex_mode):
  # type: (lex_mode_t) -> word_part__BracedVarSub
  """Parse the operator suffix of a ${} expression.

  Start parsing at the op -- we already skipped past the name.
  """
  part = self._ParseVarOf()
  self._Peek()
  if self.token_type == Id.Right_DollarBrace:
    return part  # no ops
  # Now look for ops
  op_kind = self.token_kind
  if op_kind == Kind.VTest:
    # Test ops like :- := :+ :?
    op_id = self.token_type
    arg_word = self._ReadVarOpArg(arg_lex_mode)
    if self.token_type != Id.Right_DollarBrace:
      p_die('Unexpected token (after VTest): %r', self.cur_token.val,
            token=self.cur_token)
    part.suffix_op = suffix_op.StringUnary(op_id, arg_word)
  elif op_kind == Kind.VOp0:
    # Nullary ops take no argument word.
    op_id = self.token_type
    part.suffix_op = suffix_op.StringNullary(op_id)
    self._Next(lex_mode_e.VSub_2)  # Expecting }
    self._Peek()
  elif op_kind == Kind.VOp1:
    # Unary ops like # ## % %% with a WORD argument.
    op_id = self.token_type
    arg_word = self._ReadVarOpArg(arg_lex_mode)
    if self.token_type != Id.Right_DollarBrace:
      p_die('Unexpected token (after VOp1): %r', self.cur_token.val,
            token=self.cur_token)
    part.suffix_op = suffix_op.StringUnary(op_id, arg_word)
  elif op_kind == Kind.VOp2:
    # Binary ops: / for pat sub, : for slicing.
    if self.token_type == Id.VOp2_Slash:
      op_spid = self.cur_token.span_id  # for attributing error to /
      # TODO: op_temp is only necessary for MyPy.  It can be removed when
      # 'spids' are put on the base class suffix_op_t.
      op_temp = self._ReadPatSubVarOp(arg_lex_mode)
      op_temp.spids.append(op_spid)
      op = cast(suffix_op_t, op_temp)  # for MyPy
      # Checked by the method above
      assert self.token_type == Id.Right_DollarBrace, self.cur_token
    elif self.token_type == Id.VOp2_Colon:
      op = self._ReadSliceVarOp()
      # NOTE: } in arithmetic mode.
      if self.token_type != Id.Arith_RBrace:
        # Token seems off; doesn't point to X in # ${a:1:2 X
        p_die('Unexpected token after slice: %r', self.cur_token.val,
              token=self.cur_token)
    else:
      p_die('Unexpected token %r', self.cur_token.val, token=self.cur_token)
    part.suffix_op = op
  # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
  # mode.  It's redundantly checked above.
  if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
    # ${a.} or ${!a.}
    p_die('Expected } after var sub, got %r', self.cur_token.val,
          token=self.cur_token)
  return part
def _ReadBracedBracedVarSub(self, d_quoted=False):
  # type: (bool) -> word_part__BracedVarSub
  """For the ${} expression language.

  NAME        = [a-zA-Z_][a-zA-Z0-9_]*
  NUMBER      = [0-9]+                    # ${10}, ${11}, ...
  Subscript   = '[' ('@' | '*' | ArithExpr) ']'
  VarSymbol   = '!' | '@' | '#' | ...
  VarOf       = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

  TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
  STRIP_OP    = '#' | '##' | '%' | '%%'
  CASE_OP     = ',' | ',,' | '^' | '^^'
  UnaryOp     = TEST_OP | STRIP_OP | CASE_OP | ...
  Match       = ('/' | '#' | '%') WORD    # match all / prefix / suffix
  VarExpr     = VarOf
              | VarOf UnaryOp WORD
              | VarOf ':' ArithExpr (':' ArithExpr )?
              | VarOf '/' Match '/' WORD

  LengthExpr  = '#' VarOf   # can't apply operators after length
  RefOrKeys   = '!' VarExpr # CAN apply operators after a named ref
                            # ${!ref[0]} vs ${!keys[@]} resolved later
  PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

  VarSub      = LengthExpr
              | RefOrKeys
              | PrefixQuery
              | VarExpr

  NOTES:
  - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
    slicing ${a:x+1:y+2}
  - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
  - @ and * are technically arithmetic expressions in this implementation
  - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note that
    it's also vectorized.

  Strictness over bash:
  - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
    grammar
  - ! and # prefixes can't be composed, even though named refs can be
    composed with other operators
  - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
    a prefix, and it can also be a literal part of WORD.

  From the parser's point of view, the prefix # can't be combined with
  UnaryOp/slicing/matching, and the ! can.  However
  - ${a[@]:1:2} is not allowed
  - ${#a[@]:1:2} is allowed, but gives the wrong answer
  """
  left_spid = self.cur_token.span_id
  if d_quoted:
    arg_lex_mode = lex_mode_e.VSub_ArgDQ
  else:
    arg_lex_mode = lex_mode_e.VSub_ArgUnquoted
  self._Next(lex_mode_e.VSub_1)
  self._Peek()
  ty = self.token_type
  if ty == Id.VSub_Pound:
    # Disambiguate: is '#' a length prefix or the variable itself?
    t = self.lexer.LookAhead(lex_mode_e.VSub_1)
    if t.id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
      # e.g. a name, '#' is the prefix
      self._Next(lex_mode_e.VSub_1)
      part = self._ParseVarOf()
      self._Peek()
      if self.token_type != Id.Right_DollarBrace:
        p_die("Expected } after length expression, got %r",
              self.cur_token.val, token=self.cur_token)
      part.prefix_op = Id.VSub_Pound  # length
    else:  # not a prefix, '#' is the variable
      part = self._ParseVarExpr(arg_lex_mode)
  elif ty == Id.VSub_Bang:
    # Disambiguate: is '!' a ref/keys prefix or the variable itself?
    t = self.lexer.LookAhead(lex_mode_e.VSub_1)
    if t.id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
      # e.g. a name, '!' is the prefix
      # ${!a} -- this is a ref
      # ${!3} -- this is ref
      # ${!a[1]} -- this is a ref
      # ${!a[@]} -- this is a keys
      # No lookahead -- do it in a second step, or at runtime
      self._Next(lex_mode_e.VSub_1)
      part = self._ParseVarExpr(arg_lex_mode)
      part.prefix_op = Id.VSub_Bang
    else:  # not a prefix, '!' is the variable
      part = self._ParseVarExpr(arg_lex_mode)
  # VS_NAME, VS_NUMBER, symbol that isn't # or !
  elif self.token_kind == Kind.VSub:
    part = self._ParseVarExpr(arg_lex_mode)
  else:
    # e.g. ${^}
    p_die('Unexpected token %r', self.cur_token.val, token=self.cur_token)
  part.spids.append(left_spid)
  # Does this work?
  right_spid = self.cur_token.span_id
  part.spids.append(right_spid)
  return part
def _ReadSingleQuotedPart(self, lex_mode):
  # type: (lex_mode_t) -> word_part__SingleQuotedPart
  """Read a single-quoted string, in either the '' or $'' lex mode.

  The current token is the left quote; literal tokens are accumulated until
  the right quote is seen.
  """
  left = self.cur_token
  tokens = []
  done = False
  while not done:
    self._Next(lex_mode)
    self._Peek()
    # Kind.Char emitted in DOLLAR_SQ state
    if self.token_kind in (Kind.Lit, Kind.Char):
      tokens.append(self.cur_token)
    elif self.token_kind == Kind.Eof:
      p_die('Unexpected EOF in single-quoted string that began here',
            token=left)
    elif self.token_kind == Kind.Right:
      done = True  # assume Id.Right_SingleQuote
    else:
      raise AssertionError(
          'Unhandled token in single-quoted part %s (%s)' %
          (self.cur_token, self.token_kind))
  node = word_part.SingleQuotedPart(left, tokens)
  node.spids.append(left.span_id)  # left '
  node.spids.append(self.cur_token.span_id)  # right '
  return node
def _ReadDoubleQuotedLeftParts(self):
  # type: () -> word_part_t
  """Dispatch on the current Left token to read a substitution inside a
  double-quoted context.  Quotes and process subs are not allowed here.
  """
  ty = self.token_type
  if ty in (Id.Left_DollarParen, Id.Left_Backtick):
    return self._ReadCommandSubPart(ty)
  elif ty == Id.Left_DollarBrace:
    return self._ReadBracedBracedVarSub(d_quoted=True)
  elif ty == Id.Left_DollarDParen:
    return self._ReadArithSubPart()
  elif ty == Id.Left_DollarBracket:
    return self._ReadArithSub2Part()
  raise AssertionError(self.cur_token)
def _ReadLeftParts(self):
  # type: () -> word_part_t
  """Dispatch on the current Left token to read a substitution or quoted
  string in the OUTER (unquoted command) context.
  """
  ty = self.token_type
  if ty in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
    # NOTE: $"" is treated as "" for now.  Does it make sense to add the
    # token to the part?
    return self._ReadDoubleQuotedPart()
  elif ty == Id.Left_SingleQuote:
    return self._ReadSingleQuotedPart(lex_mode_e.SQ)
  elif ty == Id.Left_DollarSingleQuote:
    return self._ReadSingleQuotedPart(lex_mode_e.DollarSQ)
  elif ty in (Id.Left_DollarParen, Id.Left_Backtick, Id.Left_ProcSubIn,
              Id.Left_ProcSubOut):
    return self._ReadCommandSubPart(ty)
  elif ty == Id.Left_DollarBrace:
    return self._ReadBracedBracedVarSub(d_quoted=False)
  elif ty == Id.Left_DollarDParen:
    return self._ReadArithSubPart()
  elif ty == Id.Left_DollarBracket:
    return self._ReadArithSub2Part()
  raise AssertionError('%s not handled' % self.cur_token)
def _ReadExtGlobPart(self):
  # type: () -> word_part__ExtGlobPart
  """
  Grammar:
    Item         = CompoundWord | EPSILON  # important: @(foo|) is allowed
    LEFT         = '@(' | '*(' | '+(' | '?(' | '!('
    RIGHT        = ')'
    ExtGlob      = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
    CompoundWord includes ExtGlobPart
  """
  left_token = self.cur_token
  arms = []  # type: List[word_t]
  spids = []
  spids.append(left_token.span_id)
  self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
  self._Next(lex_mode_e.ExtGlob)  # advance past LEFT
  read_word = False  # did we just read a word?  To handle @(||).
  while True:
    self._Peek()
    if self.token_type == Id.Right_ExtGlob:
      if not read_word:
        arms.append(osh_word.CompoundWord())  # the empty trailing arm
      spids.append(self.cur_token.span_id)
      break
    elif self.token_type == Id.Op_Pipe:
      if not read_word:
        arms.append(osh_word.CompoundWord())  # an empty arm, e.g. @(|foo)
      read_word = False
      self._Next(lex_mode_e.ExtGlob)
    # lex mode EXTGLOB should only produce these 4 kinds of tokens
    elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
      w = self._ReadCompoundWord(lex_mode=lex_mode_e.ExtGlob)
      arms.append(w)
      read_word = True
    elif self.token_kind == Kind.Eof:
      p_die('Unexpected EOF reading extended glob that began here',
            token=left_token)
    else:
      raise AssertionError('Unexpected token %r' % self.cur_token)
  part = word_part.ExtGlobPart(left_token, arms)
  part.spids.extend(spids)
  return part
def _ReadLikeDQ(self, left_dq_token, out_parts):
  # type: (Optional[token], List[word_part_t]) -> None
  """Read parts in a double-quoted context (shared by "" and here docs).

  Args:
    left_dq_token: A token if we are reading a double quoted part, or None
      if we're reading a here doc.
    out_parts: list of word_part to append to
  """
  done = False
  while not done:
    self._Next(lex_mode_e.DQ)
    self._Peek()
    if self.token_kind == Kind.Lit:
      if self.token_type == Id.Lit_EscapedChar:
        part = word_part.EscapedLiteralPart(self.cur_token)  # type: word_part_t
      else:
        part = word_part.LiteralPart(self.cur_token)
      out_parts.append(part)
    elif self.token_kind == Kind.Left:
      # A nested substitution: $(), ${}, $(()), $[] ...
      part = self._ReadDoubleQuotedLeftParts()
      out_parts.append(part)
    elif self.token_kind == Kind.VSub:
      # Simple var sub like $x or $1.
      part = word_part.SimpleVarSub(self.cur_token)
      out_parts.append(part)
    elif self.token_kind == Kind.Right:
      assert self.token_type == Id.Right_DoubleQuote, self.token_type
      if left_dq_token:
        done = True
      else:
        # In a here doc, the right quote is literal!
        out_parts.append(word_part.LiteralPart(self.cur_token))
    elif self.token_kind == Kind.Eof:
      if left_dq_token:
        p_die('Unexpected EOF reading double-quoted string that began here',
              token=left_dq_token)
      else:  # here docs will have an EOF in their token stream
        done = True
    else:
      raise AssertionError(self.cur_token)
  # Return nothing, since we appended to 'out_parts'
def _ReadDoubleQuotedPart(self):
  # type: () -> word_part__DoubleQuotedPart
  """Read a double-quoted string, with the current token as the left quote.

  Delegates to _ReadLikeDQ for the parts between the quotes; the here-doc
  case goes through _ReadLikeDQ with left_dq_token=None instead.
  (NOTE(review): the previous docstring described eof_type/here_doc args
  that this method does not take.)
  """
  dq_part = word_part.DoubleQuotedPart()
  left_dq_token = self.cur_token
  dq_part.spids.append(left_dq_token.span_id)  # Left "
  self._ReadLikeDQ(left_dq_token, dq_part.parts)
  dq_part.spids.append(self.cur_token.span_id)  # Right "
  return dq_part
def _ReadCommandSubPart(self, left_id):
  # type: (Id_t) -> word_part__CommandSubPart
  """
  NOTE: This is not in the grammar, because word parts aren't in the grammar!

  command_sub = '$(' command_list ')'
              | ` command_list `
              | '<(' command_list ')'
              | '>(' command_list ')'
  """
  left_token = self.cur_token
  left_spid = left_token.span_id
  # Set the lexer in a state so ) becomes the EOF token.
  if left_id in (Id.Left_DollarParen, Id.Left_ProcSubIn, Id.Left_ProcSubOut):
    self._Next(lex_mode_e.ShCommand)  # advance past $( etc.
    right_id = Id.Eof_RParen
    self.lexer.PushHint(Id.Op_RParen, right_id)
    c_parser = self.parse_ctx.MakeParserForCommandSub(self.line_reader,
                                                      self.lexer, right_id)
    # NOTE: This doesn't use something like main_loop because we don't want
    # to interleave parsing and execution!  Unlike 'source' and 'eval'.
    node = c_parser.ParseCommandSub()
    right_spid = c_parser.w_parser.cur_token.span_id
  elif left_id == Id.Left_Backtick and self.parse_ctx.one_pass_parse:
    # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
    # test/osh2oil.
    right_id = Id.Eof_Backtick
    self.lexer.PushHint(Id.Left_Backtick, right_id)
    c_parser = self.parse_ctx.MakeParserForCommandSub(self.line_reader,
                                                      self.lexer, right_id)
    node = c_parser.ParseCommandSub()
    right_spid = c_parser.w_parser.cur_token.span_id
  elif left_id == Id.Left_Backtick:
    # The general backtick case: collect the raw text between backticks,
    # then re-parse it as a command with a fresh line reader.
    self._Next(lex_mode_e.Backtick)  # advance past `
    parts = []
    while True:
      self._Peek()
      #print(self.cur_token)
      if self.token_type == Id.Backtick_Quoted:
        parts.append(self.cur_token.val[1:])  # remove leading \
      elif self.token_type == Id.Backtick_Other:
        parts.append(self.cur_token.val)
      elif self.token_type == Id.Backtick_Right:
        break
      elif self.token_type == Id.Eof_Real:
        # Note: this parse error is in the ORIGINAL context.  No code_str
        # yet.
        p_die('Unexpected EOF while looking for closing backtick',
              token=left_token)
      else:
        raise AssertionError
      self._Next(lex_mode_e.Backtick)
    # Calculate right SPID on CommandSubPart BEFORE re-parsing.
    right_spid = self.cur_token.span_id
    code_str = ''.join(parts)
    #log('code %r', code_str)
    # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py.  It
    # won't have the same location info as MakeParserForCommandSub(),
    # because the lexer is different.
    arena = self.parse_ctx.arena
    line_reader = reader.StringLineReader(code_str, arena)
    c_parser = self.parse_ctx.MakeOshParser(line_reader)
    arena.PushSource(source.Backticks(left_spid, right_spid))
    try:
      node = c_parser.ParseCommandSub()
    finally:
      arena.PopSource()
  else:
    raise AssertionError(left_id)
  cs_part = word_part.CommandSubPart(node, left_token)
  cs_part.spids.append(left_spid)
  cs_part.spids.append(right_spid)
  return cs_part
def ParseVar(self, kw_token):
  # type: (token) -> command_t
  """Parse an Oil 'var' assignment.

  oil_var: 'var' <'oil_var' in grammar.pgen2>

  Note that assignments must end with a newline or a semicolon.  Unlike
  shell assignments, we disallow:
    var x = 42 | wc -l
    var x = 42 && echo hi
  """
  self._Next(lex_mode_e.Expr)
  enode, last_token = self.parse_ctx.ParseOilAssign(self.lexer,
                                                    grammar_nt.oil_var)
  # Let the CommandParser see the Op_Semi or Op_Newline.
  self.buffered_word = osh_word.TokenWord(last_token)
  self._Next(lex_mode_e.ShCommand)  # always back to this
  return enode
def ParseSetVar(self, kw_token):
  # type: (token) -> command_t
  """Parse an Oil 'setvar' statement, e.g.:

  setvar a[i] = 1
  setvar i += 1
  setvar i++
  """
  self._Next(lex_mode_e.Expr)
  node, end_tok = self.parse_ctx.ParseOilAssign(self.lexer,
                                                grammar_nt.oil_setvar)
  # Buffer the trailing Op_Semi / Op_Newline so the CommandParser sees it.
  self.buffered_word = osh_word.TokenWord(end_tok)
  self._Next(lex_mode_e.ShCommand)  # always back to this
  return node
def _ReadArithExpr(self):
  # type: () -> arith_expr_t
  """Read and parse an arithmetic expression in various contexts.

  $(( 1+2 ))
  (( a=1+2 ))
  ${a[ 1+2 ]}
  ${a : 1+2 : 1+2}

  See tests/arith-context.test.sh for ambiguous cases.

  ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket
  ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

  TODO: Instead of having an eof_type, I think we should just run the arith
  parser until it's done.  That will take care of both : and ].  We switch
  the state back.
  See the assertion in ArithParser.Parse() -- unexpected extra input.
  """
  # The TDOP parser calls back into self.ReadWord(lex_mode_e.Arith).
  return tdop.TdopParser(arith_parse.SPEC, self).Parse()
def _ReadArithSubPart(self):
  # type: () -> word_part__ArithSubPart
  """
  Read an arith substitution, which contains an arith expression, e.g.
  $((a + 1)).
  """
  left_span_id = self.cur_token.span_id
  # The second ) needs to be disambiguated in cases like:
  # $(echo $(( 1+2 )) )
  self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
  # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
  # could save the lexer/reader state here, and retry if the arithmetic
  # parse fails.  But we can almost always catch this at parse time.  There
  # could be some exceptions like:
  # $((echo * foo))  # looks like multiplication
  # $((echo / foo))  # looks like division
  self._Next(lex_mode_e.Arith)
  anode = self._ReadArithExpr()
  if self.token_type != Id.Arith_RParen:
    p_die('Expected first ) to end arith sub, got %r', self.cur_token.val,
          token=self.cur_token)
  self._Next(lex_mode_e.ShCommand)  # TODO: This could be DQ or ARITH too
  # PROBLEM: $(echo $(( 1 + 2 )) )
  # Two right parens break the Id.Eof_RParen scheme
  self._Peek()
  if self.token_type != Id.Right_DollarDParen:
    p_die('Expected second ) to end arith sub, got %r', self.cur_token.val,
          token=self.cur_token)
  right_span_id = self.cur_token.span_id
  node = word_part.ArithSubPart(anode)
  node.spids.append(left_span_id)
  node.spids.append(right_span_id)
  return node
def _ReadArithSub2Part(self):
  # type: () -> word_part__ArithSubPart
  """Non-standard arith sub $[a + 1]."""
  left_span_id = self.cur_token.span_id
  self._Next(lex_mode_e.Arith)
  anode = self._ReadArithExpr()
  if self.token_type != Id.Arith_RBracket:
    p_die('Expected ], got %r', self.cur_token.val, token=self.cur_token)
  right_span_id = self.cur_token.span_id
  node = word_part.ArithSubPart(anode)
  node.spids.append(left_span_id)
  node.spids.append(right_span_id)
  return node
def ReadDParen(self):
  # type: () -> Tuple[arith_expr_t, int]
  """Read ((1+ 2)) -- command context.

  We're using the word parser because it's very similar to _ReadArithExpr
  above.

  Returns:
    The parsed expression and the span id of the token after the )).
  """
  # The second ) needs to be disambiguated in cases like $(echo $((1+2)) ).
  # TODO: Be consistent with ReadForExpression below and use
  # lex_mode_e.Arith?  Then you can get rid of this.
  self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
  self._Next(lex_mode_e.Arith)
  anode = self._ReadArithExpr()
  if self.token_type != Id.Arith_RParen:
    p_die('Expected first ) to end arith statement, got %r',
          self.cur_token.val, token=self.cur_token)
  self._Next(lex_mode_e.ShCommand)
  # PROBLEM: $(echo $(( 1 + 2 )) )
  self._Peek()
  if self.token_type != Id.Op_DRightParen:
    p_die('Expected second ) to end arith statement, got %r',
          self.cur_token.val, token=self.cur_token)
  self._Next(lex_mode_e.ShCommand)
  return anode, self.cur_token.span_id
def _NextNonSpace(self):
  # type: () -> None
  """Advance in arith mode past ignored/whitespace tokens.

  Same logic as _ReadWord, but for ReadForExpression.
  """
  self._Next(lex_mode_e.Arith)
  self._Peek()
  while self.token_kind in (Kind.Ignored, Kind.WS):
    self._Next(lex_mode_e.Arith)
    self._Peek()
def ReadForExpression(self):
  # type: () -> command__ForExpr
  """Read ((i=0; i<5; ++i)) -- part of command context."""
  self._NextNonSpace()  # skip over ((
  # Each of the three clauses may be omitted, leaving a bare ; or ).
  self._Peek()
  if self.token_type == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
    init_node = None
  else:
    init_node = self._ReadArithExpr()
  self._NextNonSpace()
  self._Peek()
  if self.token_type == Id.Arith_Semi:  # for (( ; ; i++ ))
    cond_node = None
  else:
    cond_node = self._ReadArithExpr()
  self._NextNonSpace()
  self._Peek()
  if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
    update_node = None
  else:
    update_node = self._ReadArithExpr()
  self._NextNonSpace()
  self._Peek()
  if self.token_type != Id.Arith_RParen:
    p_die('Expected ) to end for loop expression, got %r',
          self.cur_token.val, token=self.cur_token)
  self._Next(lex_mode_e.ShCommand)
  return command.ForExpr(init_node, cond_node, update_node)
def _ReadArrayLiteralPart(self):
# type: () -> word_part__ArrayLiteralPart
"""
a=(1 2 3)
TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
We want:
A=(['x']=1 ["x"]=2 [$x$y]=3)
Maybe allow this as a literal string? Because I think I've seen it before?
Or maybe force people to patch to learn the rule.
A=([x]=4)
Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
Maybe enforce that ALL have keys or NONE of have keys.
"""
self._Next(lex_mode_e.ShCommand) # advance past (
self._Peek()
if self.cur_token.id != Id.Op_LParen:
p_die('Expected ( after =, got %r', self.cur_token.val,
token=self.cur_token)
# MUST use a new word parser (with same lexer).
w_parser = WordParser(self.parse_ctx, self.lexer, self.line_reader)
words = []
while True:
w = w_parser.ReadWord(lex_mode_e.ShCommand)
if isinstance(w, word__TokenWord):
word_id = word.CommandId(w)
if word_id == Id.Right_ArrayLiteral:
break
# Unlike command parsing, array parsing allows embedded \n.
elif word_id == Id.Op_Newline: