osh/osh.asdl

-- Data types for the osh AST, aka "Lossless Syntax Tree".
--
-- Invariant: the source text can be reconstructed byte-for-byte from this
-- tree.
--
-- Exceptions:
-- * <<- here docs with leading tabs, since we don't want those for
--   conversion.  We don't want files with mixed tabs and spaces.
-- * Found to be not strictly necessary for oil conversion
--   * foo() { } vs function foo { } -- ksh 
--
-- The AST is composed of the builtin ASDL types (string, int, bool) and our
-- application type 'id', which is core.id_kind.Id.

-- Unrepresented:
-- * let arithmetic (rarely used)
-- * coprocesses -- one with arg and one without
-- * select block
-- * 1>&2- to close redirect
-- * case fallthrough ;& and ;;&

-- Represented but Not Parsed:
-- * LeftIndex -- a[foo]=bar  (the arithmetic version is parsed)
-- * ArrayPair -- ([foo]=bar)

-- Parsed but Not Implemented
-- * extended glob
-- * <> redirect

-- TODO: Preserve these source differences:
-- * order of redirects: 'echo >out.txt hi'  vs echo hi >out.txt
--   * In the printer, I want to preserve line breaks!  foo \bar?
-- * parens
--   * 1 + 2*3  vs.  1 + (2*3)  or even 1 + ((2*3))
--   * [[ (1 == 1) ]] vs [[ 1 == 1 ]]
-- * $(( 1 + 2 )) vs $[1 + 2]  (bash-specific, used by Aboriginal)
-- TODO:
-- * remove redundancy of 'string val' in token.  It should use the span.

module osh
{
  -- A portion of a line, used for error messages.
  -- TODO: Need arena_id, for different files
  line_span = (int line_id, int col, int length)

  -- A primitive token.  NOTE: val is redundant with 'span_id' for now.  If we
  -- rewrite the parser in C++, we might care for memory footprint.  But for
  -- now this is convenient.
  -- NOTE: identical strings can be shared, if we care.
  token = (id id, string val, int span_id)

  -- Optional step for {100..50..-15}
  braced_step = (int val, int negated)

  -- Optional subscript of a BracedVarSub (see word_part).
  bracket_op =
    WholeArray(id op_id)  -- * or @
  | ArrayIndex(arith_expr expr)

  -- Optional trailing operator of a BracedVarSub (see word_part).
  suffix_op =
    StringUnary(id op_id, word arg_word)  -- e.g. ${v:-default}
  | PatSub(word pat, word? replace, bool do_all, bool do_prefix, bool do_suffix)
    -- begin is optional with ${array::1}
  | Slice(arith_expr? begin, arith_expr? length)

  -- TODO: Constructors should be scoped?  array_item::Pair?
  array_item =
    ArrayWord(word w)
  | ArrayPair(word key, word value)

  word_part =
    -- TODO: should be array_item* items.  They CAN be mixed, like a=([x]=y z)
    ArrayLiteralPart(word* words)
  | LiteralPart(token token)
  | EscapedLiteralPart(token token)
    -- for 'foo=' and "${a:-}"
  | EmptyPart()
  | SingleQuotedPart(token left, token* tokens)
  | DoubleQuotedPart(word_part* parts)
  | SimpleVarSub(token token)
  | BracedVarSub(token token,
                id? prefix_op,  -- prefix # or ! operators
                bracket_op? bracket_op,
                suffix_op? suffix_op)
    -- This should be token tilde, token rest
  | TildeSubPart(string prefix)
    -- For command sub and process sub: $(...)  <(...)  >(...)
  | CommandSubPart(command command_list, token left_token)
  | ArithSubPart(arith_expr anode)
    -- {a,b,c}
  | BracedAltPart(word* words)
    -- {1..10} or {1..10..2}
  | BracedIntRangePart(int start, int end, braced_step? step)
    -- {a..f} or {a..f..2} or {a..f..-2}
  | BracedCharRangePart(string start, string end, braced_step? step)
    -- extended globs are parsed statically, unlike globs
  | ExtGlobPart(token op, word* arms)

  word =
    TokenWord(token token)
    -- A CompoundWord can contain any word_part except the Braced*Part.
    -- We could model this with another variant type but it incurs runtime
    -- overhead and seems like overkill.  Note that DoubleQuotedPart can't
    -- contain a SingleQuotedPart, etc. either.
  | CompoundWord(word_part* parts)
    -- A BracedWordTree is a word because it can appear in a command.  It can
    -- contain any type of word_part.
  | BracedWordTree(word_part* parts)
    -- For dynamic parsing of test/[ -- the string is already evaluated.
  | StringWord(id id, string s)

  -- TODO: Might want to preserve token here.
  lhs_expr =
    LhsName(string name)
  | LhsIndexedName(string name, arith_expr index)

  -- Need to preserve physical parens?  Maybe need Parens node.
  arith_expr =
    -- TODO: need token
    ArithVarRef(string name)  -- variable without $
  | ArithWord(word w)  -- a string that looks like an integer

  | UnaryAssign(id op_id, lhs_expr child)
  | BinaryAssign(id op_id, lhs_expr left, arith_expr right)
  | ArithUnary(id op_id, arith_expr child)
    -- TODO: add token for divide by zero
  | ArithBinary(id op_id, arith_expr left, arith_expr right)
  | TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)
  | FuncCall(arith_expr func, arith_expr* args)

  bool_expr =
    WordTest(word w)  -- e.g. [[ myword ]]
  | BoolBinary(id op_id, word left, word right)
  | BoolUnary(id op_id, word child)
  | LogicalNot(bool_expr child)
  | LogicalAnd(bool_expr left, bool_expr right)
  | LogicalOr(bool_expr left, bool_expr right)

  -- Notes about here docs:
  -- * 'body' is required, but must be initialized after construction.
  -- * here_end could be a word at parse time for the LST.  Then the second
  --   pass could StaticEval it to a string and set do_expansion.
  -- * To reprint the here doc, we need the here_end delimiter, but it doesn't
  --   matter at runtime.  do_expansion is calculated from it.
  -- NOTE(review): 'body' and 'do_expansion' are not fields of HereDoc as
  -- declared below; 'body' presumably refers to stdin_parts -- confirm.
  -- TODO : id -> token for translation?

  redir =
    Redir(token op, int fd, word arg_word)
  | HereDoc(token op, int fd,
            word here_begin,  -- e.g. EOF or 'EOF'
            int here_end_span_id,  -- this span is an entire line
            word_part* stdin_parts -- one for each line
           )

  assign_op = Equal | PlusEqual
  assign_pair = (lhs_expr lhs, assign_op op, word? rhs)
  env_pair = (string name, word val)

  -- Each arm tests one word against multiple words
  case_arm = (word* pat_list, command* action)
  if_arm = (command* cond, command* action)

  -- NOTE: not referenced by any other type in this module yet; see the TODO
  -- on ForEach about turning iter_words into a sum type.
  iterable =
    IterArgv
  | IterArray(word* words)

  -- TODO: Make field names consistent: child vs expr, etc.

  command =
    NoOp
    -- TODO: respect order
  | SimpleCommand(word* words, redir* redirects, env_pair* more_env)
  | Sentence(command child, token terminator)
  | Assignment(id keyword, string* flags, assign_pair* pairs)
  | ControlFlow(token token, word? arg_word)
  | Pipeline(command* children, bool negated, int* stderr_indices)
  | AndOr(id* ops, command* children)
    -- Part of for/while/until.  Can have one or more children.
  | DoGroup(command* children, redir* redirects)
    -- A brace group is a compound command, with redirects.  Can have one or
    -- more children.
  | BraceGroup(command* children, redir* redirects)
    -- NOTE: unlike DoGroup and BraceGroup, this has a single 'child' field
    -- rather than a list of children.
  | Subshell(command child, redir* redirects)
  | DParen(arith_expr child, redir* redirects)
  | DBracket(bool_expr expr, redir* redirects)
    -- do_arg_iter: whether to implicitly loop over "$@"
    -- TODO: Make iter_words a sum type.  iterable for_words
  | ForEach(string iter_name, word* iter_words, bool do_arg_iter,
            command body, redir* redirects)
    -- C-style for loop.  Any of the 3 expressions can be omitted.
    -- TODO: body is required, but only optional here because of initialization
    -- order.
  | ForExpr(arith_expr? init, arith_expr? cond, arith_expr? update,
            command? body, redir* redirects)
  | While(command* cond, command body, redir* redirects)
  | Until(command* cond, command body, redir* redirects)
  | If(if_arm* arms, command* else_action, redir* redirects)
  | Case(word to_match, case_arm* arms, redir* redirects)
  | FuncDef(string name, command body, redir* redirects)
  | TimeBlock(command pipeline)
    -- Most nodes optimize it out as command*, but there are a few places where
    -- this is useful for type safety.
  | CommandList(command* children)

  -- For now, using stderr_indices representation because it's more compact.
  -- |& in osh; |- in oil.
  -- pipe_op = Pipe | PipeAndStderr

  -- NOTE: Do we even need these types?  Arena already has methods.
  -- We just need to go from text -> text.  For execution, we'll be compiling
  -- to a different format.  We also won't bootstrap with osh code -- only oil
  -- code.  shell can call oil builtins if necessary.

  -- A node with full debug info
  -- All other nodes should have span_id?  int _loc or int _begin, int _end.
  -- It can be further compressed perhaps, like a varint.
  arena = (string* lines, line_span* spans, command root)

  -- On-disk format for an entire file.  Enough info so we can reconstruct the
  -- text byte-for-byte.
  whole_file = (string path, arena a)

  -- In-memory format for all the functions snipped out of a file.
  partial_file = (string path, arena* funcs)

  -- Glob representation, for converting ${x//} to extended regexes.

  -- Example: *.[ch] is:
  --   GlobOp(<Glob_Star '*'>),
  --   GlobLit(Glob_OtherLiteral, '.'),
  --   CharClass(False, ['ch'])  # from Glob_CleanLiterals token

  glob_part =
    GlobLit(id id, string s)
  | GlobOp(id op_id)  -- * or ?
  | CharClass(bool negated, string* strs)

  -- Char classes are opaque for now.  If we ever need them:
  -- * Collating symbols are [. .]
  -- * Equivalence classes are [=
}