/
osh.asdl
241 lines (209 loc) · 9.09 KB
/
osh.asdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
-- Data types for the osh AST, aka "Lossless Syntax Tree".
--
-- Invariant: the source text can be reconstructed byte-for-byte from this
-- tree.
--
-- Exceptions:
-- * <<- here docs with leading tabs, since we don't want those for
-- conversion. We don't want files with mixed tabs and spaces.
-- * Found to be not strictly necessary for oil conversion
-- * foo() { } vs function foo { } -- ksh
--
-- The AST is composed of the builtin ASDL types (string, int, bool) and our
-- application type 'id', which is core.id_kind.Id.
-- Unrepresented:
-- * let arithmetic (rarely used)
-- * coprocesses -- one with arg and one without
-- * select block
-- * 1>&2- to close redirect
-- * case fallthrough ;& and ;;&
-- Represented but Not Parsed:
-- * LeftIndex -- a[foo]=bar (the arithmetic version is parsed)
-- * ArrayPair -- ([foo]=bar)
-- Parsed but Not Implemented
-- * extended glob
-- * <> redirect
-- TODO: Preserve these source differences:
-- * order of redirects: 'echo >out.txt hi' vs echo hi >out.txt
-- * In the printer, I want to preserve line breaks! foo \bar?
-- * parens
-- * 1 + 2*3 vs. 1 + (2*3) or even 1 + ((2*3))
-- * [[ (1 == 1) ]] vs [[ 1 == 1 ]]
-- * $(( 1 + 2 )) vs $[1 + 2] (bash-specific, used by Aboriginal)
-- TODO:
-- * remove redundancy of 'string val' in token. It should use the span.
module osh
{
-- A portion of a line, used for error messages.
--   line_id: int identifying the line (presumably an index into
--            arena.lines -- TODO confirm).
--   col, length: int position and extent of the span within that line
--            (units, bytes vs. characters, are not stated here).
-- TODO: Need arena_id, for different files
line_span = (int line_id, int col, int length)
-- A primitive token.
--   id:      the application type 'id' giving the kind of token.
--   val:     the token's text.
--   span_id: int referring to the token's span (presumably an index into
--            arena.spans -- TODO confirm).
-- NOTE: val is redundant with 'span_id' for now. If we
-- rewrite the parser in C++, we might care for memory footprint. But for
-- now this is convenient.
-- NOTE: identical strings can be shared, if we care.
token = (id id, string val, int span_id)
-- Optional step for {100..50..-15}
-- Used as the optional 'step' field of BracedIntRangePart and
-- BracedCharRangePart.
-- NOTE(review): 'negated' is declared as int rather than bool; presumably it
-- flags a leading '-' on the step -- confirm before changing the type.
braced_step = (int val, int negated)
-- Bracketed subscript of a braced variable substitution; used as the
-- optional 'bracket_op' field of BracedVarSub.
--   WholeArray: the subscript is the operator * or @, recorded as op_id.
--   ArrayIndex: the subscript is an arithmetic expression.
bracket_op =
WholeArray(id op_id) -- * or @
| ArrayIndex(arith_expr expr)
-- Operator applied after the variable name in a braced variable
-- substitution; used as the optional 'suffix_op' field of BracedVarSub.
suffix_op =
StringUnary(id op_id, word arg_word) -- e.g. ${v:-default}
-- Pattern substitution: 'pat' is required, 'replace' is optional, and three
-- boolean flags modify the match.
-- NOTE(review): presumably do_all / do_prefix / do_suffix correspond to
-- ${x//pat/r}, ${x/#pat/r} and ${x/%pat/r} -- confirm against the parser.
| PatSub(word pat, word? replace, bool do_all, bool do_prefix, bool do_suffix)
-- begin is optional with ${array::1}
-- (length is declared optional as well.)
| Slice(arith_expr? begin, arith_expr? length)
-- An item of an array literal: either a plain word or a key/value pair.
-- The file header lists ArrayPair -- ([foo]=bar) -- as represented but not
-- yet parsed.
-- TODO: Constructors should be scoped? array_item::Pair?
array_item =
ArrayWord(word w)
| ArrayPair(word key, word value)
-- One piece of a word.  CompoundWord and BracedWordTree each hold a sequence
-- of these; DoubleQuotedPart nests them recursively.
word_part =
    -- An array literal.
    -- TODO: should be array_item* items. They CAN be mixed, like a=([x]=y z)
    ArrayLiteralPart(word* words)
    -- A single literal token.
  | LiteralPart(token token)
    -- A single token flagged as an escaped literal.
  | EscapedLiteralPart(token token)
    -- for 'foo=' and "${a:-}"
  | EmptyPart()
    -- 'left' is a token kept separately from the body tokens (presumably the
    -- opening quote -- TODO confirm).
  | SingleQuotedPart(token left, token* tokens)
    -- Double-quoted text; its contents are themselves word parts.
  | DoubleQuotedPart(word_part* parts)
    -- A variable substitution carrying only a token.
  | SimpleVarSub(token token)
    -- A braced variable substitution with up to three optional operators.
    -- All four fields are comma-separated, like every other field list in
    -- this module.
  | BracedVarSub(token token,
                 id? prefix_op,  -- prefix # or ! operators
                 bracket_op? bracket_op,
                 suffix_op? suffix_op)
    -- This should be token tilde, token rest
  | TildeSubPart(string prefix)
    -- For command sub and process sub: $(...) <(...) >(...)
    -- left_token records which of those forms was written.
  | CommandSubPart(command command_list, token left_token)
    -- An arithmetic substitution wrapping a single arith_expr.
  | ArithSubPart(arith_expr anode)
    -- {a,b,c}
  | BracedAltPart(word* words)
    -- {1..10} or {1..10..2}
  | BracedIntRangePart(int start, int end, braced_step? step)
    -- {a..f} or {a..f..2} or {a..f..-2}
  | BracedCharRangePart(string start, string end, braced_step? step)
    -- extended globs are parsed statically, unlike globs
  | ExtGlobPart(token op, word* arms)
-- A shell word.  Words appear as command arguments, redirect targets,
-- assignment right-hand sides, case patterns, and test operands in this
-- schema.
word =
-- A word consisting of exactly one token.
TokenWord(token token)
-- A CompoundWord can contain any word_part except the Braced*Part.
-- We could model this with another variant type but it incurs runtime
-- overhead and seems like overkill. Note that DoubleQuotedPart can't
-- contain a SingleQuotedPart, etc. either.
| CompoundWord(word_part* parts)
-- A BracedWordTree is a word because it can appear in a command. It can
-- contain any type of word_part.
| BracedWordTree(word_part* parts)
-- For dynamic parsing of test/[ -- the string is already evaluated.
| StringWord(id id, string s)
-- Left-hand side of an assignment.  Used by assign_pair.lhs and by the
-- UnaryAssign / BinaryAssign constructors of arith_expr.
--   LhsName:        a plain variable name.
--   LhsIndexedName: a name plus an arithmetic index expression.
-- TODO: Might want to preserve token here.
lhs_expr =
LhsName(string name)
| LhsIndexedName(string name, arith_expr index)
-- Arithmetic expression tree.  Referenced by ArithSubPart, DParen, ForExpr,
-- ArrayIndex, Slice and LhsIndexedName.
-- Operators are identified by an 'id' in op_id rather than by separate
-- constructors.
-- Need to preserve physical parens? Maybe need Parens node.
arith_expr =
-- TODO: need token
ArithVarRef(string name) -- variable without $
| ArithWord(word w) -- a string that looks like an integer
-- Assignment forms take an lhs_expr as their target.
| UnaryAssign(id op_id, lhs_expr child)
| BinaryAssign(id op_id, lhs_expr left, arith_expr right)
| ArithUnary(id op_id, arith_expr child)
-- TODO: add token for divide by zero
| ArithBinary(id op_id, arith_expr left, arith_expr right)
| TernaryOp(arith_expr cond, arith_expr true_expr, arith_expr false_expr)
-- The callee is itself an arith_expr, followed by zero or more arguments.
| FuncCall(arith_expr func, arith_expr* args)
-- Boolean expression tree; the 'expr' field of the DBracket command.
-- Leaf tests operate on words and are identified by op_id; the Logical*
-- constructors combine sub-expressions.
bool_expr =
WordTest(word w) -- e.g. [[ myword ]]
| BoolBinary(id op_id, word left, word right)
| BoolUnary(id op_id, word child)
| LogicalNot(bool_expr child)
| LogicalAnd(bool_expr left, bool_expr right)
| LogicalOr(bool_expr left, bool_expr right)
-- A redirect attached to a command: either an ordinary redirect with a
-- word argument, or a here document.  Both carry the operator token and an
-- int file descriptor.
-- Notes about here docs:
-- * 'body' is required, but must be initialized after construction.
-- * here_end could be a word at parse time for the LST. Then the second
-- pass could StaticEval it to a string and set do_expansion.
-- * To reprint the here doc, we need the here_end delimiter, but it doesn't
-- matter at runtime. do_expansion is calculated from it.
-- NOTE(review): the notes above mention 'body', 'here_end' and
-- 'do_expansion', none of which is a field of HereDoc as declared below
-- (its fields are here_begin, here_end_span_id and stdin_parts).  They may
-- describe an earlier or planned layout -- confirm.
-- TODO : id -> token for translation?
redir =
Redir(token op, int fd, word arg_word)
| HereDoc(token op, int fd,
word here_begin, -- e.g. EOF or 'EOF'
int here_end_span_id, -- this span is an entire line
word_part* stdin_parts -- one for each line
)
-- Assignment operator: two alternatives with no fields.
-- (Presumably '=' and '+=' -- the names suggest it; confirm.)
assign_op = Equal | PlusEqual
-- One assignment inside an Assignment command: target, operator, and an
-- optional right-hand side word.
assign_pair = (lhs_expr lhs, assign_op op, word? rhs)
-- A name/value binding; SimpleCommand holds these in its more_env field.
env_pair = (string name, word val)
-- Each arm tests one word against multiple words
-- pat_list holds the patterns; action holds the commands for that arm.
-- Used by the Case command.
case_arm = (word* pat_list, command* action)
-- One condition/action pair of an If command; both sides are command lists.
if_arm = (command* cond, command* action)
-- What a loop iterates over: either the argument vector (no fields) or an
-- explicit list of words.
-- NOTE(review): no field in the visible part of this module has type
-- 'iterable'; the TODO on ForEach proposes using it in place of
-- iter_words / do_arg_iter.
iterable =
IterArgv
| IterArray(word* words)
-- The command tree.  Compound commands carry their own 'redir* redirects'
-- list; nested commands are held as 'command' or 'command*' fields.
-- TODO: Make field names consistent: child vs expr, etc.
command =
-- A command with no fields.
NoOp
-- TODO: respect order
| SimpleCommand(word* words, redir* redirects, env_pair* more_env)
-- A command together with the token that terminated it.
| Sentence(command child, token terminator)
| Assignment(id keyword, string* flags, assign_pair* pairs)
-- 'token' is the keyword; the argument word is optional.
| ControlFlow(token token, word? arg_word)
-- stderr_indices: see the note after this type -- it is the compact
-- representation chosen for |& (presumably the positions of such pipes;
-- TODO confirm).
| Pipeline(command* children, bool negated, int* stderr_indices)
-- Operators and operands are stored as two parallel lists.
| AndOr(id* ops, command* children)
-- Part of for/while/until. Can have one or more children.
| DoGroup(command* children, redir* redirects)
-- A brace group is a compound command, with redirects. Can have one or
-- more children.
| BraceGroup(command* children, redir* redirects)
-- Can have one or more children.
-- NOTE(review): the field below is a single 'command', not 'command*';
-- multiple children are presumably wrapped in a CommandList -- confirm.
| Subshell(command child, redir* redirects)
| DParen(arith_expr child, redir* redirects)
| DBracket(bool_expr expr, redir* redirects)
-- do_arg_iter: whether to implicitly loop over "$@"
-- TODO: Make iter_words a sum type. iterable for_words
| ForEach(string iter_name, word* iter_words, bool do_arg_iter,
command body, redir* redirects)
-- C-style for loop. Any of the 3 expressions can be omitted.
-- TODO: body is required, but only optional here because of initialization
-- order.
| ForExpr(arith_expr? init, arith_expr? cond, arith_expr? update,
command? body, redir* redirects)
-- While and Until have identical field lists.
| While(command* cond, command body, redir* redirects)
| Until(command* cond, command body, redir* redirects)
| If(if_arm* arms, command* else_action, redir* redirects)
| Case(word to_match, case_arm* arms, redir* redirects)
| FuncDef(string name, command body, redir* redirects)
-- Wraps a single command; carries no redirects field.
| TimeBlock(command pipeline)
-- Most nodes optimize it out as command*, but there are a few places where
-- this is useful for type safety.
| CommandList(command* children)
-- For now, using stderr_indices representation because it's more compact.
-- |& in osh; |- in oil.
-- pipe_op = Pipe | PipeAndStderr
-- NOTE: Do we even need these types? Arena already has methods. They can
-- We just need to go from text -> text. For execution, we'll be compiling
-- to a different format. We also won't bootstrap with osh code -- only oil
-- code. shell can call oil builtins if necessary.
-- A node with full debug info
--   lines: the source lines.
--   spans: the line_span records (token.span_id presumably indexes this
--          list -- TODO confirm).
--   root:  the root command of the tree.
-- All other nodes should have span_id? int _loc or int _begin, int _end.
-- It can be further compressed perhaps, like a varint.
arena = (string* lines, line_span* spans, command root)
-- On-disk format for an entire file. Enough info so we can reconstruct the
-- text byte-for-byte.
-- Pairs the file's path with a single arena.
whole_file = (string path, arena a)
-- In-memory format for all the functions snipped out of a file.
-- Pairs the file's path with a list of arenas, one entry per element of
-- 'funcs'.
partial_file = (string path, arena* funcs)
-- Glob representation, for converting ${x//} to extended regexes.
-- Example: *.[ch] is:
-- GlobOp(<Glob_Star '*'>),
-- GlobLit(Glob_OtherLiteral, '.'),
-- CharClass(False, ['ch']) # from Glob_CleanLiterals token
glob_part =
-- Literal text, tagged with the id of the token it came from.
GlobLit(id id, string s)
| GlobOp(id op_id) -- * or ?
-- A bracket expression; 'negated' records whether it is inverted and
-- 'strs' holds its contents as opaque strings.
| CharClass(bool negated, string* strs)
-- Char classes are opaque for now. If we ever need them:
-- * Collating symbols are [. .]
-- * Equivalence classes are [= =]
}