-
-
Notifications
You must be signed in to change notification settings - Fork 145
/
grammar.pgen2
349 lines (298 loc) · 11 KB
/
grammar.pgen2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
# Grammar for Oil.
# Adapted from the Python 3.7 expression grammar, with several changes!
# Oil patch: removed @= **= //=
# Note that we're missing div= and xor=, which now look weird. ^= is
# exponentiation. Honestly I don't even like '%='. |= has a use case.
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' )
# For normal and annotated assignments, additional restrictions enforced by the interpreter
# Top-level expression rule: a Python-style conditional expression or a lambda.
test: or_test ['if' or_test 'else' test] | lambdef
# Variant without the conditional form, used inside comprehension 'if' clauses
# (see comp_for below) where a bare conditional would be ambiguous.
test_nocond: or_test | lambdef_nocond
# Oil patch: These used to be varargslist.
# TODO: I think we want fn(x) x+1 or |x| x+1.
lambdef: 'lambda' [name_type_list] ':' test
lambdef_nocond: 'lambda' [name_type_list] ':' test_nocond
# Boolean operators, lowest precedence first: or, and, not.
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: range_expr (comp_op range_expr)*
# Here the beginning and end are required
range_expr: expr [':' expr]
# Oil patch: removed legacy <>
# Arith_Tilde / Expr_NotTilde: token-level match operators (presumably ~ and
# !~ regex match / non-match -- confirm against the lexer's token definitions).
comp_op: (
'<'|'>'|'=='|'>='|'<='|'!='|'in'|'not' 'in'|'is'|'is' 'not'|
Arith_Tilde | Expr_NotTilde
)
star_expr: '*' expr
# Binary operator precedence, lowest first: | then 'xor' then & then shifts.
# Oil patch: 'xor' is a keyword here because '^' was repurposed for
# exponentiation (see the 'power' rule below).
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('xor' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
# Oil patch: removed '@' and '//' -> div
term: factor (('*'|'/'|'%'|'div') factor)*
factor: ('+'|'-'|'~') factor | power
# Oil patch: ** -> ^
# Also removed Python 3 'await'
power: atom trailer* ['^' factor]
atom: (
'(' [testlist_comp] ')'
| '[' [testlist_comp] ']'
| '{' [dict] '}'
# TODO: Also accept < > for fully-anchored? How does regexec work?
| '/' regex [re_flags] '/'
# NOTE: These atoms are allowed in typed array literals
| Expr_Name | Expr_Null | Expr_True | Expr_False
# TODO: Allow suffixes on floats and decimals? What about in arrays?
| Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
| dq_string | sq_string
| sh_command_sub | braced_var_sub | simple_var_sub
| sh_array_literal | array_literal
)
testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
# var f = f(x)
# Postfix operators: call, subscript, and three attribute-access forms.
trailer: (
'(' [arglist] ')'
| '[' subscriptlist ']'
| '.' Expr_Name
| '->' Expr_Name
| '::' Expr_Name
)
# e.g. setvar x->key = 0
# Like 'trailer' but without call syntax: a call result is not a valid
# assignment target (see the 'place' rule).
place_trailer: (
'[' subscriptlist ']'
| '.' Expr_Name
| '->' Expr_Name
| '::' Expr_Name
)
# Oil patch: this is 'expr' instead of 'test'
# - 1:(3<4) doesn't make any sense.
# - And then this allows us to support a[3:] and a[:i] as special cases.
# - First class slices have to be written 0:n.
subscriptlist: subscript (',' subscript)* [',']
subscript: expr | [expr] ':' [expr]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
# Dict syntax resembles JavaScript
# https://stackoverflow.com/questions/38948306/what-is-javascript-shorthand-property
#
# Examples:
# {age: 20} is like {'age': 20}
#
# x = 'age'
# d = {[x]: 20} # Evaluate x as a variable
# d = {["foo$x"]: 20} # Another expression
# d = {[x, y]: 20} # Tuple key
# d = {key1, key1: 123}
# Notes:
# - Value is optional when the key is a name, because it can be taken from the
# environment.
# - We don't have:
# - dict comprehensions. Maybe wait until LR parsing?
# - Splatting with **
# - I don't think we want set literals? It might be @{} or %{} or #{}
dict_pair: (
Expr_Name [':' test] |
'[' testlist ']' ':' test |
sq_string ':' test |
dq_string ':' test
)
dict: dict_pair (',' dict_pair)* [',']
# This is how Python implemented dict comprehensions. We can probably do the
# same.
#
# dictorsetmaker: ( ((test ':' test | '**' expr)
# (comp_for | (',' (test ':' test | '**' expr))* [','])) |
# ((test | star_expr)
# (comp_for | (',' (test | star_expr))* [','])) )
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test '=' test |
'**' test |
'*' test )
comp_for: 'for' exprlist 'in' or_test ['if' test_nocond]
#
# Oil Expressions
#
# A bare word: a sequence of unquoted literal parts.
word_part: Lit_Chars | Lit_Other
word: word_part*
# TODO: Change this to types and expressions, like
# @[1 2 3] @[(x) (y+1)] @[true false false]
array_item: (
# NOTE: Most of these occur in 'atom' above
Expr_Name | Expr_Null | Expr_True | Expr_False |
Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt |
dq_string | sq_string |
sh_command_sub | braced_var_sub | simple_var_sub |
'(' test ')'
)
# Typed array literal, e.g. @[...], closed by a plain right bracket token.
array_literal: (
'@[' array_item* Op_RBracket
)
# NOTE(review): Expr_CastedDummy appears to be a placeholder token for a
# subtree produced by another parser (the shell word/string parser), not
# matched by this grammar itself -- confirm against the parser glue code.
sh_array_literal: '@(' Expr_CastedDummy Right_ShArrayLiteral
sh_command_sub: '$(' Expr_CastedDummy Eof_RParen
# TODO: We also need r'' c'' "" c""
dq_string: '"' Expr_CastedDummy Right_DoubleQuote
sq_string: (Left_SingleQuoteRaw | Left_SingleQuoteC) Expr_CastedDummy Right_SingleQuote
braced_var_sub: '${' Expr_CastedDummy Right_DollarBrace
simple_var_sub: (
# NOTE: Everything in Kind.VSub except VSub_Name because that's ${foo}
#
# Note: we could allow $foo and $0, but disallow the rest in favor of
# ${@} and ${-}? Meh it's too inconsistent.
VSub_DollarName | VSub_Number
| VSub_Bang | VSub_At | VSub_Pound | VSub_Dollar | VSub_Star | VSub_Hyphen
| VSub_QMark
# NOTE: $? should be STATUS because it's an integer.
)
#
# Assignment / Type Variables
#
# Several differences vs. Python:
#
# - no yield expression on RHS
# - no star expressions on either side (Python 3) *x, y = 2, *b
# - no multiple assignments like: var x = y = 3
# - type annotation syntax is more restrictive # a: (1+2) = 3 is OK in python
# - We're validating the lvalue here, instead of doing it in the "transformer".
# We have the 'var' prefix which helps.
# name_type use cases:
# for x Int, y Int
# [x for x Int, y Int in ...]
# var x Int, y Int = 3, 5
# func(x Int, y Int)
name_type: Expr_Name [type_expr]
name_type_list: name_type (',' name_type)*
# A 'place' is an assignment target: a name followed by subscripts/attributes
# (place_trailer excludes call syntax, since a call isn't assignable).
place: Expr_Name place_trailer*
place_list: place (',' place)*
# Type expression with optional generic parameters, nested recursively,
# e.g. Dict[Str, List[Int]].
type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
# NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
# in command subs.
# Tokens that may terminate a statement-level expression.
end_stmt: '}' | ';' | Op_Newline | Eof_Real
# TODO: oil_var should be oil_var_decl and use name_type_list
oil_var: place_list [type_expr] '=' testlist end_stmt
oil_setvar: place_list (augassign | '=') testlist end_stmt
# For $stringfunc(x, y=1) and @arrayfunc(a, b='s')
oil_arglist: '(' [arglist] ')'
# for if (x > 0) etc.
oil_expr: '(' testlist ')'
# e.g. return 1 + 2 * 3
command_expr: testlist end_stmt
# Example: for (a Int, b Int in expr) { ... }
oil_for: '(' place_list 'in' testlist ')'
# Examples: func print(msg Str, *args ; span_id Int = 0, token Token = None)
# proc rule(@argv, b Block) { }
# We have to put the opening { there for pgen2. TODO: Also accept :{ so the
# lexer knows to change modes.
oil_func_proc: Expr_Name ['(' params [';' params] ')'] [type_expr] '{'
#oil_func_proc: NAME '(' params* [';' params*] ')'
params: param (',' param)* [',']
# ... is for *args or **kwargs of any type, and @argv is for string args
param: Expr_Name [type_expr] ['=' expr] | '...' Expr_Name | '@' Expr_Name
#
# Regex Sublanguage
#
char_literal: Char_OneChar | Char_Hex | Char_Unicode4 | Char_Unicode8
# we allow a-z A-Z 0-9 as ranges, but otherwise they have to be quoted
# The parser enforces that they are single strings
range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
# digit or a-z
# We have to do further validation of ranges later.
class_literal_term: (
range_char ['-' range_char ]
| '~' Expr_Name
# $mychars or ${mymodule.mychars}
| simple_var_sub | braced_var_sub
# e.g. 'abc' or "abc$mychars"
# NOTE: range_char has sq_string
| dq_string
# Reserved for [[.collating sequences.]] (Unicode)
| '.' Expr_Name
# Reserved for [[=character equivalents=]] (Unicode)
| '=' Expr_Name
# TODO: Do they actually work in bash/awk/egrep/sed/etc.?
)
# A character class, e.g. [a-f 'xyz' ~space].
class_literal: '[' class_literal_term+ ']'
# NOTE: Here is an example of where you can put ^ in the middle of a pattern in
# Python, and it matters!
# >>> r = re.compile('.f[a-z]*', re.DOTALL|re.MULTILINE)
# >>> r.findall('z\nfoo\nbeef\nfood\n')
# ['\nfoo', 'ef', '\nfood']
# >>> r = re.compile('.^f[a-z]*', re.DOTALL|re.MULTILINE)
# >>> r.findall('z\nfoo\nbeef\nfood\n')
# ['\nfoo', '\nfood']
re_atom: (
char_literal
# builtin regex like 'digit' or a regex reference like 'D'
| Expr_Name
# %begin or %end
| Expr_Symbol
| class_literal
# ~digit or ~ %boundary or ~[a-f]
| '~' [Expr_Name | Expr_Symbol | class_literal]
# Splice another expression
| '@' Expr_Name
# any %start %end are preferred
| '.' | '^' | '$'
# egrep has zero-width assertions \< and \>
# We could make them %< and %> or %startword %endword
| '<' | '>'
# literal STRINGS like $foo or ${module.foo}
| simple_var_sub | braced_var_sub
# In a language-independent spec, backslashes are disallowed within 'sq'.
# Write it with char literals outside strings: 'foo' \\ 'bar' \n
| sq_string | dq_string
# capturing group
| '(' regex ['as' name_type] ')'
# : is syntactic space for non-capturing group. (! would seem like negation.)
| ':' '(' regex ')'
# syntactic space for Perl-style backtracking
# !REF 1 !REF name
# !AHEAD(d+) !BEHIND(d+) !NOT_AHEAD(d+) !NOT_BEHIND(d+)
| '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
# Might want this obscure conditional construct. Can't use C-style ternary
# because '?' is a regex operator.
#| '{' regex 'if' regex 'else' regex '}'
# Others:
# PCRE has (?R ) for recursion? That could be !RECURSE()
# Note: .NET has && in character classes, making it a recursive language
)
# e.g. a{3} a{3,4} a{3,} a{,4} but not a{,}
repeat_range: (
Expr_DecInt [',']
| ',' Expr_DecInt
| Expr_DecInt ',' Expr_DecInt
)
repeat_op: (
'+' | '*' | '?'
# In PCRE, ?? *? +? {}? is lazy/nongreedy and ?+ *+ ++ {}+ is "possessive"
# We use N and P modifiers within {}.
# a{L +} a{P ?} a{P 3,4} a{P ,4}
| '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
)
# A concatenation of atoms, each with an optional repetition operator.
re_alt: (re_atom [repeat_op])+
# Alternation; the leading [re_alt] allows an empty alternative.
regex: [re_alt] (('|'|'or') re_alt)*
# /digit+ ; multiline,ignorecase/
re_flag: ['~'] Expr_Name
re_flags: ';' re_flag (',' re_flag)*
# Syntax reserved for PCRE/Python, but that's not in ERE:
#
# nop-greedy a{N *}
# non-capturing :( digit+ )
# backtracking !REF 1 !AHEAD(d+)
#
# Legacy syntax:
#
# ^ and $ instead of %start and %end
# < and > instead of %start_word and %end_word
# . instead of dot
# | instead of 'or'