/
grammar.pgen2
277 lines (239 loc) · 9.89 KB
/
grammar.pgen2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# Grammar for Python
# NOTE WELL: You should also follow all the steps listed at
# https://devguide.python.org/grammar/
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
# single_input: a lone NEWLINE, a simple_stmt, or a compound_stmt plus one
# extra NEWLINE.
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
# file_input: any mix of bare NEWLINEs and statements, then ENDMARKER.
file_input: (NEWLINE | stmt)* ENDMARKER
# eval_input: one testlist, any number of trailing NEWLINEs, then ENDMARKER.
eval_input: testlist NEWLINE* ENDMARKER
#decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
#decorators: decorator+
#decorated: decorators (classdef | funcdef | async_funcdef)
# 'async' prefix on a function definition. No other rule shown in this file
# references async_funcdef (the 'decorated' rule that would is commented out
# above); async_stmt spells out 'async' funcdef itself.
async_funcdef: 'async' funcdef
# Function definition: name, parenthesized parameters, optional '->' return
# annotation, then ':' and the body (suite).
funcdef: 'def' NAME parameters ['->' test] ':' suite
# Parenthesized parameter list; the list inside the parentheses is optional.
parameters: '(' [typedargslist] ')'
# Parameter list of a 'def', where each name may carry an annotation (tfpdef).
# Three shapes are accepted:
#   1. plain/defaulted names, optionally followed after a comma by a
#      '*' section and/or a '**' name;
#   2. a '*' section (bare '*' or '*name', then more names), optionally
#      followed by a '**' name;
#   3. a lone '**' name.
# A trailing comma is allowed at the end of every shape.
typedargslist: (
    tfpdef ['=' test] (',' tfpdef ['=' test])*
    [ ','
      [   '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
        | '**' tfpdef [',']
      ]
    ]
  | '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
  | '**' tfpdef [',']
)
# A single parameter name with an optional ': test' annotation.
tfpdef: NAME [':' test]
# Parameter list without annotations (each name is a bare vfpdef); this is
# the form referenced by lambdef and lambdef_nocond. It has the same three
# shapes as typedargslist:
#   1. plain/defaulted names, then optionally ',' and a '*' section and/or
#      a '**' name;
#   2. a '*' section, optionally followed by a '**' name;
#   3. a lone '**' name.
# A trailing comma is allowed at the end of every shape.
varargslist: (
    vfpdef ['=' test] (',' vfpdef ['=' test])*
    [ ','
      [   '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
        | '**' vfpdef [',']
      ]
    ]
  | '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
  | '**' vfpdef [',']
)
# A parameter name with no annotation.
vfpdef: NAME
# A statement is either a one-line simple_stmt or a compound_stmt.
stmt: simple_stmt | compound_stmt
# One or more small statements separated by ';' (a trailing ';' is allowed),
# terminated by NEWLINE.
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
# The statement kinds that may appear inside a simple_stmt.
small_stmt: (
    expr_stmt
  | del_stmt
  | pass_stmt
  | flow_stmt
  | import_stmt
  | global_stmt
  | nonlocal_stmt
  | assert_stmt
)
# Expression statement. After the leading testlist_star_expr comes one of:
#   - an annotated assignment (annassign),
#   - an augmented assignment (augassign) with a yield_expr or testlist,
#   - zero or more '=' targets/values (zero means a bare expression).
expr_stmt: testlist_star_expr (
    annassign
  | augassign (yield_expr | testlist)
  | ('=' (yield_expr | testlist_star_expr))*
)
# Annotation after ':' with an optional '=' initial value.
annassign: ':' test ['=' test]
# Comma-separated tests and/or starred expressions; trailing comma allowed.
testlist_star_expr: (test | star_expr) (',' (test | star_expr))* [',']
# Oil patch: removed @= **= //=
# Note that we're missing div= and xor=, which now look weird. ^= is
# exponentiation. Honestly I don't even like '%='. |= has a use case.
# Layout below: arithmetic operators, then bitwise/'^=', then shifts.
augassign: (
    '+=' | '-=' | '*=' | '/=' | '%='
  | '&=' | '|=' | '^='
  | '<<=' | '>>='
)
# For normal and annotated assignments, additional restrictions enforced by the interpreter
# (This remark concerns the assignment rules above, not del_stmt below.)
# 'del' followed by a list of targets.
del_stmt: 'del' exprlist
# The no-op statement.
pass_stmt: 'pass'
# Statements that transfer control.
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
# 'return' with an optional value list.
return_stmt: 'return' [testlist]
# A yield expression used as a statement.
yield_stmt: yield_expr
# 'raise', optionally with an exception expression, optionally with a
# 'from' cause.
raise_stmt: 'raise' [test ['from' test]]
# An import is either 'import a.b as c, ...' or 'from ... import ...'.
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# 'from' takes either a dotted_name preceded by any number of leading dots,
# or dots alone; 'import' takes '*', a parenthesized name list, or a bare
# name list.
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: (
    'from' ( ('.' | '...')* dotted_name | ('.' | '...')+ )
    'import' ( '*' | '(' import_as_names ')' | import_as_names )
)
# A single imported name with an optional 'as' alias.
import_as_name: NAME ['as' NAME]
# A dotted module path with an optional 'as' alias.
dotted_as_name: dotted_name ['as' NAME]
# Comma-separated import_as_name list; trailing comma allowed.
import_as_names: import_as_name (',' import_as_name)* [',']
# Comma-separated dotted_as_name list; no trailing comma.
dotted_as_names: dotted_as_name (',' dotted_as_name)*
# One or more NAMEs joined by '.'.
dotted_name: NAME ('.' NAME)*
# 'global' followed by one or more comma-separated names.
global_stmt: 'global' NAME (',' NAME)*
# 'nonlocal' followed by one or more comma-separated names.
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
# 'assert' with a condition and an optional second expression after ','.
assert_stmt: 'assert' test [',' test]
# All statement kinds that carry a ':' suite.
compound_stmt: (
    if_stmt | while_stmt | for_stmt | try_stmt | with_stmt
  | funcdef | classdef | async_stmt
)
# 'async' may prefix a function definition, a with statement or a for loop.
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
# if / elif* / optional else.
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
# while loop with optional else clause.
while_stmt: 'while' test ':' suite ['else' ':' suite]
# for loop over a testlist, binding an exprlist, with optional else clause.
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
# 'try' body followed by either
#   - one or more except clauses, then optional 'else', then optional
#     'finally'; or
#   - a 'finally' clause alone.
try_stmt: (
    'try' ':' suite
    (   (except_clause ':' suite)+
        ['else' ':' suite]
        ['finally' ':' suite]
      | 'finally' ':' suite
    )
)
# 'with' followed by one or more comma-separated items and a body.
with_stmt: 'with' with_item (',' with_item)* ':' suite
# A context expression with an optional 'as' target.
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
# 'except' alone, or with an expression and an optional 'as' NAME.
except_clause: 'except' [test ['as' NAME]]
# Oil patch: removed INDENT/DEDENT
# As a result a suite here is only ever a simple_stmt.
suite: simple_stmt
# Conditional expression ('a if cond else b') layered over or_test, or a
# lambda.
test: or_test ['if' or_test 'else' test] | lambdef
# Same as test but without the 'if ... else' form; referenced by comp_if and
# lambdef_nocond.
test_nocond: or_test | lambdef_nocond
# Lambda with an optional, unannotated parameter list.
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
# Boolean operators, loosest first: or, and, not.
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
# An expr followed by zero or more (comp_op expr) pairs.
comparison: expr (comp_op expr)*
# Oil patch: removed legacy <>
# 'not' 'in' and 'is' 'not' are each written as two tokens.
comp_op: '<'|'>'|'=='|'>='|'<='|'!='|'in'|'not' 'in'|'is'|'is' 'not'
# Starred expression: '*' followed by an expr.
star_expr: '*' expr
# Binary operator ladder, loosest binding first; each level is a repetition
# of the next tighter level separated by that level's operators.
expr: xor_expr ('|' xor_expr)*
# NOTE(review): power (below) also accepts '^' directly after an atom_expr,
# so a '^' that follows an operand appears to be consumed there as
# exponentiation before this rule gets to see it, which would make the '^'
# alternative here unreachable. Confirm whether xor was meant to get its own
# token.
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
# Oil patch: removed '@' and '//' -> div
term: factor (('*'|'/'|'%'|'div') factor)*
# Unary prefix operators; may be stacked because factor recurses.
factor: ('+'|'-'|'~') factor | power
# Oil patch: ** -> ^
# The right operand is a factor, so it may itself carry a unary prefix or
# another '^'.
power: atom_expr ['^' factor]
# An atom with an optional 'await' prefix and any number of trailers
# (call, subscript, attribute).
atom_expr: ['await'] atom trailer*
# Oil patch: removed STRING. TODO: Add back Oil strings.
# Atoms, one alternative per line: the three bracketed forms inherited from
# Python, then names/numbers/constant tokens, then the Oil-specific
# literals and substitutions defined further down in this file.
atom: (
    '(' [yield_expr | testlist_comp] ')'
  | '[' [testlist_comp] ']'
  | '{' [dictorsetmaker] '}'
  | NAME
  | NUMBER
  | '...'
  | Expr_Null
  | Expr_True
  | Expr_False
  | array_literal
  | sh_array_literal
  | command_sub
  | sh_command_sub
  | regex_literal
  | dq_string
  | sq_string
  | var_sub
)
# Contents of '(...)' or '[...]' in atom: a first element followed by either
# a comprehension (comp_for) or more comma-separated elements with an
# optional trailing comma.
testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
# Postfix forms on an atom: call, subscript, attribute access.
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
# One or more comma-separated subscripts; trailing comma allowed.
subscriptlist: subscript (',' subscript)* [',']
# Either a plain index (test) or a slice with optional lower bound, optional
# upper bound and optional sliceop.
subscript: test | [test] ':' [test] [sliceop]
# Second ':' of a slice with an optional expression after it.
sliceop: ':' [test]
# Comma-separated expr / star_expr items; trailing comma allowed.
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
# Comma-separated tests; trailing comma allowed.
testlist: test (',' test)* [',']
# Contents of '{...}' in atom. Two families, told apart by the first item:
#   - dict-like: 'key: value' pairs and/or '**' unpackings;
#   - set-like: plain tests and/or starred expressions.
# Each family is either a comprehension (first item + comp_for) or a
# comma-separated list with an optional trailing comma.
dictorsetmaker: (
    ( (test ':' test | '**' expr)
      (comp_for | (',' (test ':' test | '**' expr))* [',']) )
  | ( (test | star_expr)
      (comp_for | (',' (test | star_expr))* [',']) )
)
# Class definition; the parenthesized argument list and its contents are
# both optional.
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
# Comma-separated call arguments; trailing comma allowed.
arglist: argument (',' argument)* [',']
# Keywords are written as test nodes rather than NAME because using NAME
# results in an ambiguity; ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# All alternatives live in a single rule to avoid a grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# '*' is matched explicitly here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: (
    test [comp_for]
  | test '=' test
  | '**' test
  | '*' test
)
# Continuation of a comprehension: another 'for' clause or an 'if' filter.
comp_iter: comp_for | comp_if
# One 'for ... in ...' clause, optionally followed by further clauses.
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
# A comprehension 'for' clause with an optional 'async' prefix.
comp_for: ['async'] sync_comp_for
# An 'if' filter (condition is test_nocond), optionally followed by further
# clauses.
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
# 'yield' with an optional argument.
yield_expr: 'yield' [yield_arg]
# Either 'from' and an expression, or a list of values to yield.
yield_arg: 'from' test | testlist
#
# Oil Expressions
#
# One piece of a word: a literal token, a glob star, or a bracketed
# char_class.
word_part: Lit_Chars | Lit_Other | Glob_Star | char_class
# A word is any number of adjacent word_parts (possibly none).
word: word_part*
# '@[' ... ']' holding words separated by WS_Space, with optional WS_Space
# after the opener and before the closing Op_RBracket.
array_literal: (
    '@['
    [WS_Space] [word] (WS_Space word)* [WS_Space]
    Op_RBracket
)
# NOTE(review): Expr_WordsDummy / Expr_CommandDummy look like placeholder
# tokens standing in for content parsed outside this grammar -- confirm
# against the lexer/parser glue.
sh_array_literal: '@(' Expr_WordsDummy Right_ArrayLiteral
sh_command_sub: '$(' Expr_CommandDummy Eof_RParen
# TODO: letters, escaped, and ranges
# For now a char_class_part is just a Lit_Chars token.
char_class_part: Lit_Chars
# Op_LBracket ... Op_RBracket holding char_class_parts separated by
# WS_Space, with optional WS_Space just inside either bracket.
char_class: (
    Op_LBracket
    [WS_Space] [char_class_part] (WS_Space char_class_part)* [WS_Space]
    Op_RBracket
)
# One component of a regex literal.
regex_part: Expr_Name | dq_string | char_class
# empty regex invalid for now
# (hence regex_part+ rather than regex_part*).
regex_literal: '$/' regex_part+ Arith_Slash
# line continuation is ignored by lexer
# Pieces that may appear inside a double-quoted string. No other rule shown
# in this file references dq_part; dq_string below uses Expr_DqDummy instead.
dq_part: (
    Lit_EscapedChar
  | Lit_Chars
  | Lit_Other
  | VSub_DollarName
  | VSub_Number
  | command_sub
  | sh_command_sub
  | var_sub
)
# TODO: We also need r'' c'' "" c""
# NOTE(review): Expr_DqDummy / Expr_SqDummy look like placeholder tokens for
# string contents parsed outside this grammar -- confirm.
dq_string: '"' Expr_DqDummy Right_DoubleQuote
# Single-quoted string opened by either the raw or the C-style left quote
# token.
sq_string: (Left_SingleQuoteRaw | Left_SingleQuoteC) Expr_SqDummy Right_SingleQuote
# '$[' ... ']' holding words separated by WS_Space; same inner shape as
# array_literal.
command_sub: (
    '$['
    [WS_Space] [word] (WS_Space word)* [WS_Space]
    Op_RBracket
)
# '${' expr '}' substitution.
var_sub: '${' expr Op_RBrace
#
# Assignment / Type Variables
#
# Several differences vs. Python:
#
# - no yield expression on RHS
# - no star expressions on either side (Python 3) *x, y = 2, *b
# - no multiple assignments like: var x = y = 3
# - type annotation syntax is more restrictive # a: (1+2) = 3 is OK in python
# - We're validating the lvalue here, instead of doing it in the "transformer".
# We have the 'var' prefix which helps.
#
# TODO: Get rid of lvalue
# place: ...
# oil_setvar: place_list
# oil_var: typed_var_list =
# Trailers allowed on an assignment target: subscript/slice and attribute
# access. The call form ('(' [arglist] ')') of the general 'trailer' rule is
# deliberately absent.
lvalue_trailer: '[' subscriptlist ']' | '.' NAME
# An assignment target: a NAME followed by any number of lvalue_trailers,
# e.g. x, x[0], x.y[1:2]. Using lvalue_trailer instead of the general
# 'trailer' rejects call syntax such as f(x) on the left-hand side at parse
# time, which is what "validating the lvalue here" requires; previously
# lvalue_trailer was defined but never referenced.
# NOTE(review): child nodes under lvalue are now 'lvalue_trailer' rather than
# 'trailer'; confirm the parse-tree consumer accepts that node type.
lvalue: NAME lvalue_trailer*
# One or more comma-separated lvalues; no trailing comma.
lvalue_list: lvalue (',' lvalue)*
# A type name, optionally parameterized by a bracketed, comma-separated list
# of nested type_exprs.
type_expr: NAME [ '[' type_expr (',' type_expr)* ']' ]
# NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
# in command subs.
# Tokens that may terminate an Oil assignment or return expression.
end_stmt: '}' | ';' | Op_Newline | Eof_Real
# Declaration body: targets, optional type, '=', values, terminator.
oil_var: lvalue_list [type_expr] '=' testlist end_stmt
# Mutation body: targets, an augmented-assignment operator or '=', values,
# terminator.
oil_setvar: lvalue_list (augassign | '=') testlist end_stmt
# For $stringfunc(x, y=1) and @arrayfunc(a, b='s')
# Parenthesized call arguments; the argument list may be empty.
oil_arglist: '(' [arglist] ')'
# for if (x > 0) etc.
# A parenthesized testlist; the parentheses are required.
oil_expr: '(' testlist ')'
# e.g. return 1 + 2 * 3
# The values followed by a statement terminator (end_stmt).
return_expr: testlist end_stmt
# Example: for (a Int, b Int in expr) { ... }
# NOTE(review): lvalue_list has no place for a type after each name, so the
# 'a Int' spelling in the example does not appear to be matched by this rule
# as written -- confirm whether types are intended here.
oil_for: '(' lvalue_list 'in' testlist ')'
# Examples: func print(msg Str, *args ; span_id Int = 0, token Token = None)
# proc rule(@argv, b Block) { }
# NOTE(review): 'param' below spells rest arguments as '...' NAME and has no
# '*' alternative, so the '*args' in the first example does not match the
# rules as written.
# We have to put the opening { there for pgen2. TODO: Also accept :{ so the
# lexer knows to change modes.
# Signature: name, optional parenthesized params (with an optional second
# group after ';'), optional return type_expr, then the opening '{'.
# NOTE(review): when parentheses are present at least one param is required,
# so an empty 'NAME()' is not matched -- confirm this is intended.
oil_func_proc: NAME ['(' params [';' params] ')'] [type_expr] '{'
#oil_func_proc: NAME '(' params* [';' params*] ')'
# One or more comma-separated params; trailing comma allowed.
params: param (',' param)* [',']
# ... is for *args or **kwargs of any type, and @argv is for string args
# A regular param is a NAME with an optional type and optional '=' default.
param: NAME [type_expr] ['=' expr] | '...' NAME | '@' NAME
# Problem: we can't end it on a newline because say dict literals will have
# multiple lines. But EOF isn't right either.
#oil_var: lvalue_list [type_expr] '=' testlist (Op_Semi | Op_Newline)
#oil_setvar: lvalue_list (augassign | '=') testlist (Op_Semi | Op_Newline)
#
# Blocks
#
# Hard to parse:
#
# echo = 1
# echo -x +y # is this allowed?
# echo('x' + 'y')
# echo.method()
#
# hard_stmt: NAME ('=' | '(' | word*)
#
# Should you allow hyphens in names? A minus sign will never appear first.
# x-1 # this is always a command
# To use it as an expression, it must be written as 'expr x-1' or similar.