-
-
Notifications
You must be signed in to change notification settings - Fork 145
/
grammar.pgen2
349 lines (298 loc) · 11 KB
/
grammar.pgen2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
# Grammar for Oil.
# Adapted from the Python 3.7 expression grammar, with several changes!
# Oil patch: removed @= **= //=
# Note that we're missing div= and xor=, which now look weird. ^= is
# exponentiation. Honestly I don't even like '%='. |= has a use case.
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' )
# For normal and annotated assignments, additional restrictions enforced by the interpreter
# Top-level expression rule: a Python-style conditional expression or a lambda.
test: or_test ['if' or_test 'else' test] | lambdef
# Variant without the conditional form, used inside comprehension 'if' clauses
# (see comp_for below) where a bare conditional would be ambiguous.
test_nocond: or_test | lambdef_nocond
# Oil patch: These used to be varargslist.
# TODO: I think we want fn(x) x+1 or |x| x+1.
lambdef: 'lambda' [name_type_list] ':' test
lambdef_nocond: 'lambda' [name_type_list] ':' test_nocond
# Boolean operators, lowest precedence first: or, and, not.
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: range_expr (comp_op range_expr)*
# Here the beginning and end are required
range_expr: expr [':' expr]
# Oil patch: removed legacy <>
# Arith_Tilde / Expr_NotTilde: token-level match operators (presumably ~ and
# !~ regex match / non-match -- confirm against the lexer's token definitions).
comp_op: (
'<'|'>'|'=='|'>='|'<='|'!='|'in'|'not' 'in'|'is'|'is' 'not'|
Arith_Tilde | Expr_NotTilde
)
star_expr: '*' expr
# Binary operator precedence, lowest first: | then 'xor' then & then shifts.
# Oil patch: 'xor' is a keyword here because '^' was repurposed for
# exponentiation (see the 'power' rule below).
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('xor' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
# Oil patch: removed '@' and '//' -> div
term: factor (('*'|'/'|'%'|'div') factor)*
factor: ('+'|'-'|'~') factor | power
# Oil patch: ** -> ^
# Also removed Python 3 'await'
power: atom trailer* ['^' factor]
atom: (
'(' [testlist_comp] ')'
| '[' [testlist_comp] ']'
| '{' [dict] '}'
# TODO: Also accept < > for fully-anchored? How does regexec work?
| '/' regex [re_flags] '/'
# NOTE: These atoms are allowed in typed array literals
| Expr_Name | Expr_Null | Expr_True | Expr_False
# TODO: Allow suffixes on floats and decimals? What about in arrays?
| Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
| dq_string | sq_string
| sh_command_sub | braced_var_sub | simple_var_sub
| sh_array_literal | array_literal
)
testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
# var f = f(x)
# Postfix operators: call, subscript, and three attribute-access forms.
trailer: (
'(' [arglist] ')'
| '[' subscriptlist ']'
| '.' Expr_Name
| '->' Expr_Name
| '::' Expr_Name
)
# e.g. setvar x->key = 0
# Like 'trailer' but without call syntax: a call result is not a valid
# assignment target (see the 'place' rule).
place_trailer: (
'[' subscriptlist ']'
| '.' Expr_Name
| '->' Expr_Name
| '::' Expr_Name
)
# Oil patch: this is 'expr' instead of 'test'
# - 1:(3<4) doesn't make any sense.
# - And then this allows us to support a[3:] and a[:i] as special cases.
# - First class slices have to be written 0:n.
subscriptlist: subscript (',' subscript)* [',']
subscript: expr | [expr] ':' [expr]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
# Dict syntax resembles JavaScript
# https://stackoverflow.com/questions/38948306/what-is-javascript-shorthand-property
#
# Examples:
# {age: 20} is like {'age': 20}
#
# x = 'age'
# d = {[x]: 20} # Evaluate x as a variable
# d = {["foo$x"]: 20} # Another expression
# d = {[x, y]: 20} # Tuple key
# d = {key1, key1: 123}
# Notes:
# - Value is optional when the key is a name, because it can be taken from the
# environment.
# - We don't have:
# - dict comprehensions. Maybe wait until LR parsing?
# - Splatting with **
# - I don't think we want set literals? It might be @{} or %{} or #{}
dict_pair: (
Expr_Name [':' test] |
'[' testlist ']' ':' test |
sq_string ':' test |
dq_string ':' test
)
dict: dict_pair (',' dict_pair)* [',']
# This is how Python implemented dict comprehensions. We can probably do the
# same.
#
# dictorsetmaker: ( ((test ':' test | '**' expr)
# (comp_for | (',' (test ':' test | '**' expr))* [','])) |
# ((test | star_expr)
# (comp_for | (',' (test | star_expr))* [','])) )
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test '=' test |
'**' test |
'*' test )
comp_for: 'for' exprlist 'in' or_test ['if' test_nocond]
#
# Oil Expressions
#
# A bare word: a sequence of unquoted literal parts.
word_part: Lit_Chars | Lit_Other
word: word_part*
# TODO: Change this to types and expressions, like
# @[1 2 3] @[(x) (y+1)] @[true false false]
array_item: (
# NOTE: Most of these occur in 'atom' above
Expr_Name | Expr_Null | Expr_True | Expr_False |
Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt |
dq_string | sq_string |
sh_command_sub | braced_var_sub | simple_var_sub |
'(' test ')'
)
# Typed array literal, e.g. @[...], closed by a plain right bracket token.
array_literal: (
'@[' array_item* Op_RBracket
)
# NOTE(review): Expr_CastedDummy appears to be a placeholder token for a
# subtree produced by another parser (the shell word/string parser), not
# matched by this grammar itself -- confirm against the parser glue code.
sh_array_literal: '@(' Expr_CastedDummy Right_ShArrayLiteral
sh_command_sub: '$(' Expr_CastedDummy Eof_RParen
# TODO: We also need r'' c'' "" c""
dq_string: '"' Expr_CastedDummy Right_DoubleQuote
sq_string: (Left_SingleQuoteRaw | Left_SingleQuoteC) Expr_CastedDummy Right_SingleQuote
braced_var_sub: '${' Expr_CastedDummy Right_DollarBrace
simple_var_sub: (
# NOTE: Everything in Kind.VSub except VSub_Name because that's ${foo}
#
# Note: we could allow $foo and $0, but disallow the rest in favor of
# ${@} and ${-}? Meh it's too inconsistent.
VSub_DollarName | VSub_Number
| VSub_Bang | VSub_At | VSub_Pound | VSub_Dollar | VSub_Star | VSub_Hyphen
| VSub_QMark
# NOTE: $? should be STATUS because it's an integer.
)
#
# Assignment / Type Variables
#
# Several differences vs. Python:
#
# - no yield expression on RHS
# - no star expressions on either side (Python 3) *x, y = 2, *b
# - no multiple assignments like: var x = y = 3
# - type annotation syntax is more restrictive # a: (1+2) = 3 is OK in python
# - We're validating the lvalue here, instead of doing it in the "transformer".
# We have the 'var' prefix which helps.
# name_type use cases:
# for x Int, y Int
# [x for x Int, y Int in ...]
# var x Int, y Int = 3, 5
# func(x Int, y Int)
name_type: Expr_Name [type_expr]
name_type_list: name_type (',' name_type)*
# A 'place' is an assignment target: a name followed by subscripts/attributes
# (place_trailer excludes call syntax, since a call isn't assignable).
place: Expr_Name place_trailer*
place_list: place (',' place)*
# Type expression with optional generic parameters, nested recursively,
# e.g. Dict[Str, List[Int]].
type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
# NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
# in command subs.
# Tokens that may terminate a statement-level expression.
end_stmt: '}' | ';' | Op_Newline | Eof_Real
# TODO: oil_var should be oil_var_decl and use name_type_list
oil_var: place_list [type_expr] '=' testlist end_stmt
oil_setvar: place_list (augassign | '=') testlist end_stmt
# For $stringfunc(x, y=1) and @arrayfunc(a, b='s')
oil_arglist: '(' [arglist] ')'
# for if (x > 0) etc.
oil_expr: '(' testlist ')'
# e.g. return 1 + 2 * 3
command_expr: testlist end_stmt
# Example: for (a Int, b Int in expr) { ... }
oil_for: '(' place_list 'in' testlist ')'
# Examples: func print(msg Str, *args ; span_id Int = 0, token Token = None)
# proc rule(@argv, b Block) { }
# We have to put the opening { there for pgen2. TODO: Also accept :{ so the
# lexer knows to change modes.
oil_func_proc: Expr_Name ['(' params [';' params] ')'] [type_expr] '{'
#oil_func_proc: NAME '(' params* [';' params*] ')'
params: param (',' param)* [',']
# ... is for *args or **kwargs of any type, and @argv is for string args
param: Expr_Name [type_expr] ['=' expr] | '...' Expr_Name | '@' Expr_Name
#
# Regex Sublanguage
#
char_literal: Char_OneChar | Char_Hex | Char_Unicode4 | Char_Unicode8
# we allow a-z A-Z 0-9 as ranges, but otherwise they have to be quoted
# The parser enforces that they are single strings
range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
# digit or a-z
# We have to do further validation of ranges later.
class_literal_term: (
range_char ['-' range_char ]
| '~' Expr_Name
# $mychars or ${mymodule.mychars}
| simple_var_sub | braced_var_sub
# e.g. 'abc' or "abc$mychars"
# NOTE: range_char has sq_string
| dq_string
# Reserved for [[.collating sequences.]] (Unicode)
| '.' Expr_Name
# Reserved for [[=character equivalents=]] (Unicode)
| '=' Expr_Name
# TODO: Do they actually work in bash/awk/egrep/sed/etc.?
)
# A character class, e.g. [a-f 'xyz' ~space].
class_literal: '[' class_literal_term+ ']'
# NOTE: Here is an example of where you can put ^ in the middle of a pattern in
# Python, and it matters!
# >>> r = re.compile('.f[a-z]*', re.DOTALL|re.MULTILINE)
# >>> r.findall('z\nfoo\nbeef\nfood\n')
# ['\nfoo', 'ef', '\nfood']
# >>> r = re.compile('.^f[a-z]*', re.DOTALL|re.MULTILINE)
# >>> r.findall('z\nfoo\nbeef\nfood\n')
# ['\nfoo', '\nfood']
re_atom: (
char_literal
# builtin regex like 'digit' or a regex reference like 'D'
| Expr_Name
# %begin or %end
| Expr_Symbol
| class_literal
# ~digit or ~ %boundary or ~[a-f]
| '~' [Expr_Name | Expr_Symbol | class_literal]
# Splice another expression
| '@' Expr_Name
# any %start %end are preferred
| '.' | '^' | '$'
# egrep has zero-width assertions \< and \>
# We could make them %< and %> or %startword %endword
| '<' | '>'
# literal STRINGS like $foo or ${module.foo}
| simple_var_sub | braced_var_sub
# In a language-independent spec, backslashes are disallowed within 'sq'.
# Write it with char literals outside strings: 'foo' \\ 'bar' \n
| sq_string | dq_string
# capturing group
| '(' regex ['as' name_type] ')'
# : is syntactic space for non-capturing group. (! would seem like negation.)
| ':' '(' regex ')'
# syntactic space for Perl-style backtracking
# !REF 1 !REF name
# !AHEAD(d+) !BEHIND(d+) !NOT_AHEAD(d+) !NOT_BEHIND(d+)
| '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
# Might want this obscure conditional construct. Can't use C-style ternary
# because '?' is a regex operator.
#| '{' regex 'if' regex 'else' regex '}'
# Others:
# PCRE has (?R ) for recursion? That could be !RECURSE()
# Note: .NET has && in character classes, making it a recursive language
)
# e.g. a{3} a{3,4} a{3,} a{,4} but not a{,}
repeat_range: (
Expr_DecInt [',']
| ',' Expr_DecInt
| Expr_DecInt ',' Expr_DecInt
)
repeat_op: (
'+' | '*' | '?'
# In PCRE, ?? *? +? {}? is lazy/nongreedy and ?+ *+ ++ {}+ is "possessive"
# We use N and P modifiers within {}.
# a{L +} a{P ?} a{P 3,4} a{P ,4}
| '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
)
# A concatenation of atoms, each with an optional repetition operator.
re_alt: (re_atom [repeat_op])+
# Alternation; the leading [re_alt] allows an empty alternative.
regex: [re_alt] (('|'|'or') re_alt)*
# /digit+ ; multiline,ignorecase/
re_flag: ['~'] Expr_Name
re_flags: ';' re_flag (',' re_flag)*
# Syntax reserved for PCRE/Python, but that's not in ERE:
#
# nop-greedy a{N *}
# non-capturing :( digit+ )
# backtracking !REF 1 !AHEAD(d+)
#
# Legacy syntax:
#
# ^ and $ instead of %start and %end
# < and > instead of %start_word and %end_word
# . instead of dot
# | instead of 'or'