Permalink
Fetching contributors…
Cannot retrieve contributors at this time
242 lines (209 sloc) 9.09 KB
-- Data types for the osh AST, aka "Lossless Syntax Tree".
--
-- Invariant: the source text can be reconstructed byte-for-byte from this
-- tree.
--
-- Exceptions:
-- * <<- here docs with leading tabs, since we don't want those for
-- conversion. We don't want files with mixed tabs and spaces.
-- * Found to be not strictly necessary for oil conversion
-- * foo() { } vs function foo { } -- ksh
--
-- The AST is composed of the builtin ASDL types (string, int, bool) and our
-- application type 'id', which is core.id_kind.Id.
-- Unrepresented:
-- * let arithmetic (rarely used)
-- * coprocesses -- one with arg and one without
-- * select block
-- * 1>&2- to close redirect
-- * case fallthrough ;& and ;;&
-- Represented but Not Parsed:
-- * LeftIndex -- a[foo]=bar (the arithmetic version is parsed)
-- * ArrayPair -- ([foo]=bar)
-- Parsed but Not Implemented
-- * extended glob
-- * <> redirect
-- TODO: Preserve these source differences:
-- * order of redirects: 'echo >out.txt hi' vs echo hi >out.txt
-- * In the printer, I want to preserve line breaks! foo \bar?
-- * parens
-- * 1 + 2*3 vs. 1 + (2*3) or even 1 + ((2*3))
-- * [[ (1 == 1) ]] vs [[ 1 == 1 ]]
-- * $(( 1 + 2 )) vs $[1 + 2] (bash-specific, used by Aboriginal)
-- TODO:
-- * remove redundancy of 'string val' in token. It should use the span.
module osh
{
-- A portion of a line, used for error messages.
-- TODO: Need arena_id, for different files
line_span = (int line_id, int col, int length)
-- A primitive token. NOTE: val is redundant with 'span_id' for now. If we
-- rewrite the parser in C++, we might care for memory footprint. But for
-- now this is convenient.
-- NOTE: identical strings can shared, if we care.
token = (id id, string val, int span_id)
-- Optional step for {100..50..-15}
braced_step = (int val, int negated)
bracket_op =
WholeArray(id op_id) -- * or @
| ArrayIndex(arith_expr expr)
suffix_op =
StringUnary(id op_id, word arg_word) -- e.g. ${v:-default}
| PatSub(word pat, word? replace, bool do_all, bool do_prefix, bool do_suffix)
-- begin is optional with ${array::1}
| Slice(arith_expr? begin, arith_expr? length)
-- TODO: Constructors should be scoped? array_item::Pair?
array_item =
ArrayWord(word w)
| ArrayPair(word key, word value)
word_part =
-- TODO: should be array_item* items. They CAN be mixed, like a=([x]=y z)
ArrayLiteralPart(word* words)
| LiteralPart(token token)
| EscapedLiteralPart(token token)
-- for 'foo=' and "${a:-}"
| EmptyPart()
| SingleQuotedPart(token left, token* tokens)
| DoubleQuotedPart(word_part* parts)
| SimpleVarSub(token token)
| BracedVarSub(token token,
id? prefix_op, -- prefix # or ! operators
bracket_op? bracket_op
suffix_op? suffix_op)
-- This should be token tilde, token rest
| TildeSubPart(token token)
-- For command sub and process sub: $(...) <(...) >(...)
| CommandSubPart(command command_list, token left_token)
| ArithSubPart(arith_expr anode)
-- {a,b,c}
| BracedAltPart(word* words)
-- {1..10} or {1..10..2}
| BracedIntRangePart(int start, int end, braced_step? step)
-- {a..f} or {a..f..2} or {a..f..-2}
| BracedCharRangePart(string start, string end, braced_step? step)
-- extended globs are parsed statically, unlike globs
| ExtGlobPart(token op, word* arms)
word =
TokenWord(token token)
-- A CompoundWord can contain any word_part except the Braced*Part.
-- We could model this with another variant type but it incurs runtime
-- overhead and seems like overkill. Note that DoubleQuotedPart can't
-- contain a SingleQuotedPart, etc. either.
| CompoundWord(word_part* parts)
-- A BracedWordTree is a word because it can appear in a command. It can
-- contains any type of word_part.
| BracedWordTree(word_part* parts)
-- For dynamic parsing of test/[ -- the string is already evaluated.
| StringWord(id id, string s)
-- TODO: Might want to preserve token here.
lhs_expr =
LhsName(string name)
| LhsIndexedName(string name, arith_expr index)
-- Need to preserve physical parens? Maybe need Parens node.
arith_expr =
-- TODO: need token
ArithVarRef(string name) -- variable without $
| ArithWord(word w) -- a string that looks like an integer
| UnaryAssign(id op_id, lhs_expr child)
| BinaryAssign(id op_id, lhs_expr left, arith_expr right)
| ArithUnary(id op_id, arith_expr child)
-- TODO: add token for divide by zero
| ArithBinary(id op_id, arith_expr left, arith_expr right)
| TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)
| FuncCall(arith_expr func, arith_expr* args)
bool_expr =
WordTest(word w) -- e.g. [[ myword ]]
| BoolBinary(id op_id, word left, word right)
| BoolUnary(id op_id, word child)
| LogicalNot(bool_expr child)
| LogicalAnd(bool_expr left, bool_expr right)
| LogicalOr(bool_expr left, bool_expr right)
-- Notes about here docs:
-- * 'body' is required, but must be initialized after construction.
-- * here_end could be a word at parse time for the LST. Then the second
-- pass could StaticEval it to a string and set do_expansion.
-- * To reprint the here doc, we need the here_end delimiter, but it doesn't
-- matter at runtime. do_expansion is calculated from it.
-- TODO : id -> token for translation?
redir =
Redir(token op, int fd, word arg_word)
| HereDoc(token op, int fd,
word here_begin, -- e.g. EOF or 'EOF'
int here_end_span_id, -- this span is an entire line
word_part* stdin_parts -- one for each line
)
assign_op = Equal | PlusEqual
assign_pair = (lhs_expr lhs, assign_op op, word? rhs)
env_pair = (string name, word val)
-- Each arm tests one word against multiple words
case_arm = (word* pat_list, command* action)
if_arm = (command* cond, command* action)
iterable =
IterArgv
| IterArray(word* words)
-- TODO: Make field names consistent: child vs expr, etc.
command =
NoOp
-- TODO: respect order
| SimpleCommand(word* words, redir* redirects, env_pair* more_env)
| Sentence(command child, token terminator)
| Assignment(id keyword, string* flags, assign_pair* pairs)
| ControlFlow(token token, word? arg_word)
| Pipeline(command* children, bool negated, int* stderr_indices)
| AndOr(id* ops, command* children)
-- Part of for/while/until. Can have one or more children.
| DoGroup(command* children, redir* redirects)
-- A brace group is a compound command, with redirects. Can have one or
-- more children.
| BraceGroup(command* children, redir* redirects)
-- Can have one or more children.
| Subshell(command child, redir* redirects)
| DParen(arith_expr child, redir* redirects)
| DBracket(bool_expr expr, redir* redirects)
-- do_arg_iter: whether to implicitly loop over "$@"
-- TODO: Make iter_words a sum type. iterable for_words
| ForEach(string iter_name, word* iter_words, bool do_arg_iter,
command body, redir* redirects)
-- C-style for loop. Any of the 3 expressions can be omitted.
-- TODO: body is required, but only optional here because of initialization
-- order.
| ForExpr(arith_expr? init, arith_expr? cond, arith_expr? update,
command? body, redir* redirects)
| While(command* cond, command body, redir* redirects)
| Until(command* cond, command body, redir* redirects)
| If(if_arm* arms, command* else_action, redir* redirects)
| Case(word to_match, case_arm* arms, redir* redirects)
| FuncDef(string name, command body, redir* redirects)
| TimeBlock(command pipeline)
-- Most nodes optimize it out as command*, but there are a few places where
-- this is useful for type safety.
| CommandList(command* children)
-- For now, using stderr_indices representation because it's more compact.
-- |& in osh; |- in oil.
-- pipe_op = Pipe | PipeAndStderr
-- NOTE: Do we even need these types? Arena already has methods. They can
-- We just need to go from text -> text. For execution, we'll be compiling
-- to a different format. We also won't bootstrap with osh code -- only oil
-- code. shell can call oil builtins if necessary.
-- A node with full debug info
-- All other nodes should have span_id? int _loc or int _begin, int _end.
-- It can be further compressed perhaps, like a varint.
arena = (string* lines, line_span* spans, command root)
-- On-disk format for an entire file. Enough info so we can reconstruct the
-- text byte-for-byte.
whole_file = (string path, arena a)
-- In-memory format for all the functions snipped out of a file.
partial_file = (string path, arena* funcs)
-- Glob representation, for converting ${x//} to extended regexes.
-- Example: *.[ch] is:
-- GlobOp(<Glob_Star '*'>),
-- GlobLit(Glob_OtherLiteral, '.'),
-- CharClassExpr(False, 'ch') # from Glob_CleanLiterals token
glob_part =
GlobLit(id id, string s)
| GlobOp(id op_id) -- * or ?
| CharClass(bool negated, string* strs)
-- Char classes are opaque for now. If we ever need them:
-- * Collating symbols are [. .]
-- * Equivalence classes are [=
}