# oil_lang/grammar.pgen2

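# Grammar for Oil, derived from the Python 3 grammar.  Oil-specific changes
# are marked with "Oil patch" comments below.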

# NOTE WELL: You should also follow all the steps listed at
# https://devguide.python.org/grammar/

# Start symbols for the grammar:
#       single_input is a single interactive statement;
#       file_input is a module or sequence of commands read from an input file;
#       eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
# Entry point for one interactive statement: a blank line, a simple statement,
# or a compound statement followed by an extra NEWLINE.
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
# Entry point for a whole input file: any mix of blank lines and statements,
# then ENDMARKER.
file_input: (NEWLINE | stmt)* ENDMARKER
# Entry point for a bare expression list: a testlist, optional trailing
# NEWLINEs, then ENDMARKER.
eval_input: testlist NEWLINE* ENDMARKER

#decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
#decorators: decorator+
#decorated: decorators (classdef | funcdef | async_funcdef)

# 'async def ...'.  In the visible grammar this rule is only referenced from
# the commented-out 'decorated' rule; async functions reach the parser through
# async_stmt instead.
async_funcdef: 'async' funcdef
# Function definition with an optional '-> test' return annotation.
funcdef: 'def' NAME parameters ['->' test] ':' suite

# Parenthesized, possibly empty, parameter list.
parameters: '(' [typedargslist] ')'
# Parameter list for 'def'.  Three shapes are accepted:
#   1. plain parameters (each with an optional default), optionally followed
#      by a '*' section and/or a '**' parameter
#   2. a list that starts with '*'
#   3. a lone '**' parameter
# A trailing comma is allowed in every shape.
typedargslist: (
    tfpdef ['=' test] (',' tfpdef ['=' test])*
    [',' [ '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
         | '**' tfpdef [','] ]]
  | '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
  | '**' tfpdef [',']
)
# A parameter name with an optional ': test' annotation.
tfpdef: NAME [':' test]
# Same three shapes as typedargslist, but built from vfpdef (no annotations).
# Used by lambdef and lambdef_nocond.
varargslist: (
    vfpdef ['=' test] (',' vfpdef ['=' test])*
    [',' [ '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
         | '**' vfpdef [','] ]]
  | '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
  | '**' vfpdef [',']
)
# A bare parameter name.
vfpdef: NAME

# A statement is either a one-line simple statement or a compound statement.
stmt: simple_stmt | compound_stmt
# One or more small statements separated by ';', with an optional trailing
# ';', terminated by NEWLINE.
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
# The statement kinds that may appear inside a simple_stmt.
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
             import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
# After the leading testlist_star_expr: an annotated assignment, an augmented
# assignment, or zero or more chained '=' assignments (zero means a bare
# expression statement).
expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) |
                     ('=' (yield_expr|testlist_star_expr))*)
# ': test' annotation with an optional '= test' value.
annassign: ':' test ['=' test]
# Comma-separated tests and/or star expressions; trailing comma allowed.
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
# Oil patch: removed @= **= //=
# Note that we're missing div= and xor=, which now look weird.  ^= is
# exponentiation.  Honestly I don't even like '%='.  |= has a use case.
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
            '<<=' | '>>=' )
# For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
# Statements that transfer control.
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
# 'return' with an optional value list.
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
# Bare 'raise', 'raise X', or 'raise X from Y'.
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
              'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
# Trailing comma is allowed here but not in dotted_as_names.
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
# One or more comma-separated names; no trailing comma.
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
# 'assert cond' with an optional second test after a comma.
assert_stmt: 'assert' test [',' test]

# Statements that own one or more nested suites.
compound_stmt: (
    if_stmt | while_stmt | for_stmt | try_stmt | with_stmt
  | funcdef | classdef | async_stmt
)
# 'async' may prefix a function definition, a 'with', or a 'for'.
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
# 'if' with any number of 'elif' arms and an optional 'else'.
if_stmt: (
  'if' test ':' suite
  ('elif' test ':' suite)*
  ['else' ':' suite]
)
# 'while' and 'for' loops each take an optional 'else' suite.
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
# After the 'try' suite: either one or more except clauses (then optional
# 'else', then optional 'finally'), or a lone 'finally'.
try_stmt: (
  'try' ':' suite
  ( (except_clause ':' suite)+ ['else' ':' suite] ['finally' ':' suite]
  | 'finally' ':' suite
  )
)
# One or more comma-separated context managers sharing a single suite.
with_stmt: 'with' with_item (',' with_item)*  ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
# Oil patch: removed INDENT/DEDENT
# A suite is therefore just a single simple_stmt.
suite: simple_stmt

# Conditional expression ('a if cond else b') or a lambda.
test: or_test ['if' or_test 'else' test] | lambdef
# Variant of 'test' without the conditional form; used by comp_if and
# lambdef_nocond.
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
# Boolean operators, loosest binding first: or, and, not.
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
# Zero or more comparison operators may follow the first expr, so comparisons
# can be chained.
comparison: expr (comp_op expr)*
# Oil patch: removed legacy <>
# 'not' 'in' and 'is' 'not' are each two tokens.
comp_op: '<'|'>'|'=='|'>='|'<='|'!='|'in'|'not' 'in'|'is'|'is' 'not'
# Starred expression, e.g. for unpacking.
star_expr: '*' expr
# Binary operators, loosest binding first.  Each level below repeats its
# operator with '*'; power instead takes a single optional right operand
# through 'factor'.
# Bitwise OR.
expr: xor_expr ('|' xor_expr)*
# Oil patch: exclusive-or is spelled 'xor'.  '^' belongs to exponentiation
# (see power below); using '^' here as well would make 'a ^ b' match two
# different precedence levels.
# NOTE(review): requires the lexer to emit 'xor' as a keyword, like 'div'.
xor_expr: and_expr ('xor' and_expr)*
# Bitwise AND.
and_expr: shift_expr ('&' shift_expr)*
# Bit shifts.
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
# Addition and subtraction.
arith_expr: term (('+'|'-') term)*
# Oil patch: removed '@' and '//' -> div
term: factor (('*'|'/'|'%'|'div') factor)*
# Unary plus, minus, and bitwise NOT; these may be stacked.
factor: ('+'|'-'|'~') factor | power
# Oil patch: ** -> ^
# The right operand is a 'factor', so it may carry its own unary operator and
# its own nested power.
power: atom_expr ['^' factor]
# An atom with an optional leading 'await' and any number of trailers.
atom_expr: ['await'] atom trailer*
# Oil patch: removed STRING.  TODO: Add back Oil strings.
# The innermost expression forms: bracketed groups and displays, names,
# numbers, '...', the Expr_Null / Expr_True / Expr_False tokens, and the
# Oil-specific literal and substitution rules.
atom: (
    '(' [yield_expr|testlist_comp] ')'
  | '[' [testlist_comp] ']'
  | '{' [dictorsetmaker] '}'
  | NAME | NUMBER | '...'
  | Expr_Null | Expr_True | Expr_False
  | array_literal | sh_array_literal
  | command_sub | sh_command_sub
  | regex_literal
  | dq_string | sq_string | var_sub
)
# Body of a parenthesized or bracketed display: either one element followed
# by a comprehension clause, or a comma-separated list (trailing comma OK).
testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
# Suffixes on an atom: call, subscript, or attribute access.
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
# A plain index, or a slice with optional lower bound, upper bound and sliceop.
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
# Comma-separated lists; each allows a trailing comma.
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
# Body of a '{...}' display.  First alternative: dict entries ('k: v' or
# '**expr').  Second alternative: set elements.  Either may instead be a
# single element followed by a comprehension clause.
dictorsetmaker: (
    ( (test ':' test | '**' expr)
      (comp_for | (',' (test ':' test | '**' expr))* [',']) )
  | ( (test | star_expr)
      (comp_for | (',' (test | star_expr))* [',']) )
)

# Class definition; the parenthesized argument list is optional and may be
# empty.
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite

# One or more comma-separated arguments; trailing comma allowed.
arglist: argument (',' argument)*  [',']

# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
            test '=' test |
            '**' test |
            '*' test )

# Comprehension clauses.  comp_iter lets 'for' and 'if' clauses follow one
# another in any sequence after the first 'for'.
comp_iter: comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
# A 'for' clause with an optional 'async' prefix.
comp_for: ['async'] sync_comp_for
# The condition is a test_nocond, i.e. no bare conditional expression.
comp_if: 'if' test_nocond [comp_iter]

# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME

# 'yield', 'yield <testlist>', or 'yield from <test>'.
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist


#
# Oil Expressions
#

# A word is a run of zero or more word parts: literal character tokens, a
# glob star, or a bracketed character class.
word_part: Lit_Chars | Lit_Other | Glob_Star | char_class
word: word_part*
# '@[' ... ']': whitespace-separated words, with optional whitespace padding
# just inside the brackets.
array_literal: ( '@[' [WS_Space] [word] (WS_Space word)* [WS_Space] Op_RBracket )

# '@(' and '$(' forms.  NOTE(review): the *Dummy tokens look like placeholders
# for content parsed outside this grammar -- confirm against the lexer/parser.
sh_array_literal: '@(' Expr_WordsDummy Right_ArrayLiteral
sh_command_sub: '$(' Expr_CommandDummy Eof_RParen

# TODO: letters, escaped, and ranges
char_class_part: Lit_Chars
# Bracketed, whitespace-separated list of char_class_part items, with optional
# whitespace padding just inside the brackets.
char_class: (
  Op_LBracket
  [WS_Space] [char_class_part] (WS_Space char_class_part)* [WS_Space]
  Op_RBracket
)

regex_part: Expr_Name | dq_string | char_class
# empty regex invalid for now
# (hence 'regex_part+' rather than 'regex_part*')
regex_literal: '$/' regex_part+ Arith_Slash

# line continuation is ignored by lexer
# Pieces of a double-quoted string.  No other rule visible in this grammar
# refers to dq_part.
dq_part: (
    Lit_EscapedChar
  | Lit_Chars
  | Lit_Other
  | VSub_DollarName
  | VSub_Number
  | command_sub
  | sh_command_sub
  | var_sub
)

# TODO: We also need r'' c''  "" c""
dq_string: '"' Expr_DqDummy Right_DoubleQuote
# Single-quoted string opened by either the raw or the C-style left quote.
sq_string: (
  (Left_SingleQuoteRaw | Left_SingleQuoteC)
  Expr_SqDummy
  Right_SingleQuote
)

# '$[' ... ']': same inner shape as array_literal.
command_sub: ( '$[' [WS_Space] [word] (WS_Space word)* [WS_Space] Op_RBracket )

# '${' expr '}'.
var_sub: '${' expr Op_RBrace

#
# Assignment / Type Variables
#
# Several differences vs. Python:
#
# - no yield expression on RHS
# - no star expressions on either side (Python 3)    *x, y = 2, *b
# - no multiple assignments like: var x = y = 3
# - type annotation syntax is more restrictive    # a: (1+2) = 3 is OK in python
# - We're validating the lvalue here, instead of doing it in the "transformer".
#   We have the 'var' prefix which helps.
#
# TODO: Get rid of lvalue
# place: ...
# oil_setvar: place_list
# oil_var: typed_var_list = 

# Suffixes permitted on an assignment target: subscript/slice and attribute
# access.  Unlike 'trailer' there is no call form, so a target such as
# 'f(x)' is rejected by the grammar itself.
lvalue_trailer: '[' subscriptlist ']' | '.' NAME
# An assignable location: a name followed by any number of lvalue trailers.
lvalue: NAME lvalue_trailer*
# One or more comma-separated targets; no trailing comma.
lvalue_list: lvalue (',' lvalue)*

# A type name with optional bracketed, comma-separated type parameters, which
# may themselves be parameterized.
type_expr: NAME [ '[' type_expr (',' type_expr)* ']' ]

# NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
# in command subs.
# Tokens that may terminate an Oil assignment or return expression.
end_stmt: '}' | ';' | Op_Newline | Eof_Real 
# Declaration: targets, one optional type_expr after the whole target list,
# '=', values, terminator.  The introducing keyword is not part of this rule.
oil_var: lvalue_list [type_expr] '=' testlist end_stmt
# Mutation: targets, then either '=' or an augmented assignment operator,
# values, terminator.
oil_setvar: lvalue_list (augassign | '=') testlist end_stmt

# For $stringfunc(x, y=1) and @arrayfunc(a, b='s')
# Parenthesized, possibly empty, argument list.
oil_arglist: '(' [arglist] ')'

# for if (x > 0) etc.
oil_expr: '(' testlist ')'
# e.g. return 1 + 2 * 3
# The expression list must be followed by an end_stmt terminator.
return_expr: testlist end_stmt

# Example: for (a Int, b Int in expr) { ... }
# NOTE(review): lvalue_list has no slot for a type after each name, so the
# typed form in the example above does not appear to match this rule as
# written -- confirm whether per-name types are intended here.
oil_for: '(' lvalue_list 'in' testlist ')'

# Examples: func print(msg Str, ...args ; span_id Int = 0, token Token = None)
#           proc rule(@argv, b Block) { }
#           func f() { }
# We have to put the opening { there for pgen2.  TODO: Also accept :{ so the
# lexer knows to change modes.
#
# Signature of a func or proc, starting at its name.  The whole parenthesized
# section is optional.  Inside it, the first parameter list is optional too, so
# that an empty '()' parses; a second list may follow after ';'.  An optional
# type_expr may precede the opening '{'.
oil_func_proc: NAME ['(' [params] [';' params] ')'] [type_expr] '{'
#oil_func_proc: NAME '(' params* [';' params*] ')'
# One or more comma-separated parameters; trailing comma allowed.
params: param (',' param)* [',']
# ... is for *args or **kwargs of any type, and @argv is for string args
# A plain parameter is a name with an optional type and an optional default.
param: NAME [type_expr] ['=' expr] | '...' NAME | '@' NAME

# Problem: we can't end it on a newline because say dict literals will have
# multiple lines.  But EOF isn't right either.
#oil_var: lvalue_list [type_expr] '=' testlist (Op_Semi | Op_Newline)

#oil_setvar: lvalue_list (augassign | '=') testlist (Op_Semi | Op_Newline)


#
# Blocks
# 
# Hard to parse:
#
# echo = 1
# echo -x +y  # is this allowed?
# echo('x' + 'y')
# echo.method()
# 
# hard_stmt: NAME ('=' | '(' | word*)
# 
# Should you allow hyphens in names?  A minus sign will never appear first.
# x-1  # this is always a command
# must be expr x-1 or so forth