
Clean up comments, docstrings, and remove debug prints.

In particular, a big comment in word_parse.py.
Andy Chu committed Aug 31, 2018
1 parent 00e24bf, commit 2c857ff91f61f78163e79b7fd32f1dc952bcfd3a
Showing with 58 additions and 109 deletions.
  1. +1 −1 core/braces.py
  2. +6 −16 core/legacy.py
  3. +4 −8 core/libstr.py
  4. +1 −2 core/process.py
  5. +1 −1 core/state.py
  6. +1 −4 core/util.py
  7. +1 −1 core/word.py
  8. +1 −3 osh/asdl_gen.py
  9. +1 −3 osh/ast_lib.py
  10. +0 −2 osh/bool_parse.py
  11. +0 −6 osh/cmd_parse.py
  12. +41 −62 osh/word_parse.py
core/braces.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 from __future__ import print_function
 """
-braces.py
+braces.py - Implementation of {andy,bob}@example.com

 NOTE: bash implements brace expansion in the braces.c file (835 lines). It
 uses goto!
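
For readers new to the feature: {andy,bob}@example.com expands to andy@example.com bob@example.com. A minimal Python sketch of the idea (illustration only, not OSH's algorithm, which also handles nesting and ranges):

  # Sketch of non-nested brace alternation.  The real implementation also
  # handles nesting like {a,{b,c}} and ranges like {1..5}.
  import itertools
  import re

  def ExpandBraces(s):
    # re.split with a capture group alternates literal and group pieces;
    # odd indices hold the comma-separated alternatives.
    parts = re.split(r'\{([^{}]*)\}', s)
    choices = [p.split(',') if i % 2 else [p] for i, p in enumerate(parts)]
    return [''.join(combo) for combo in itertools.product(*choices)]

  print(ExpandBraces('{andy,bob}@example.com'))
  # ['andy@example.com', 'bob@example.com']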
core/legacy.py
@@ -1,31 +1,21 @@
 #!/usr/bin/env python
 """
-legacy.py
+legacy.py - Word Splitting
-Nice blog post on the complexity/corner cases/difering intuition of splitting
+Nice blog post on the complexity/corner cases/differing intuition of splitting
 strings:
 https://chriszetter.com/blog/2017/10/29/splitting-strings/
 python-dev doesn't want to touch it anymore!
 Other notes:
-- How does this compare to awk -F?
-Idea: This is discouraged/legacy, so write it in Oil rather than C++?
-Problem: Need both classes and algebraic data types.
-Do we have different splitters? Awk splitter might be useful. Regex
-splitter later. CSV splitter?
-LiteralSlice.
-Other kinds of splitters:
+Other possible splitters:
+- AwkSplitter -- how does this compare to awk -F?
 - RegexSplitter
 - CsvSplitter
-- TSV2Splitter -- this transforms because of # \u0065 in JSON. So it's not a
-  pure slice, but neither is IFS splitting because of backslashes.
-- AwkSplitter
+- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
+  a pure slice, but neither is IFS splitting because of backslashes.
 - Perl?
   - does perl have a spilt context?
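
As context for the splitters above: the default IFS splitter treats runs of whitespace as a single delimiter and produces no empty leading or trailing fields. A simplified model (whitespace IFS only; it ignores the backslash escapes and non-whitespace IFS characters that keep the real thing from being a pure slice):

  # Simplified IFS whitespace splitting: runs of delimiters collapse, and
  # leading/trailing delimiters produce no empty fields.
  def IfsSplit(s, ifs=' \t\n'):
    words, current = [], []
    for ch in s:
      if ch in ifs:
        if current:
          words.append(''.join(current))
          current = []
      else:
        current.append(ch)
    if current:
      words.append(''.join(current))
    return words

  print(IfsSplit('  one   two\tthree '))  # ['one', 'two', 'three']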
core/libstr.py
@@ -2,15 +2,11 @@
"""
libstr.py - String library functions that can be exposed with a saner syntax.
Instead of
Instead of:
local y=${x//a*/b}
var y = x -> sub('a*', 'b', :ALL)
Or maybe:
var y = x -> sub( g/a*/, 'b', :ALL)
local y=${x//a*/b}
var y = x -> sub('a*', 'b', :ALL)
var y = x -> sub( Glob/a*/, 'b', :ALL) # maybe a glob literal
"""
import libc
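
For reference, ${x//a*/b} replaces every greedy match of the glob pattern a* with b. A rough model by translating the glob to a regex (hypothetical helper handling only * and ?, not the real libstr code):

  import re

  def GlobSub(s, pat, replace):
    # Translate a tiny subset of glob syntax to a regex, then replace all.
    regex = ''.join('.*' if c == '*' else '.' if c == '?' else re.escape(c)
                    for c in pat)
    return re.sub(regex, replace, s)

  x = 'banana'
  print(GlobSub(x, 'a*', 'b'))  # 'bb', like y=${x//a*/b} in bash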
core/process.py
@@ -7,8 +7,7 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 from __future__ import print_function
 """
-process.py - Runtime library for launching processes and manipulating file
-descriptors.
+process.py - Launch processes and manipulate file descriptors.
 """
 import errno
core/state.py
@@ -7,7 +7,7 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 from __future__ import print_function
 """
-state.py -- Interpreter state
+state.py - Interpreter state
 """
 import cStringIO
core/util.py
@@ -7,10 +7,7 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 from __future__ import print_function
 """
-util.py -- Common infrastructure.
-In some cases, we're using C++ idioms in Python, so the code translates more
-easily to C++.
+util.py - Common infrastructure.
 """
 import cStringIO
core/word.py
@@ -1,5 +1,5 @@
"""
word.py -- Functions for using words as "tokens".
word.py - Utility functions for words, e.g. treating them as "tokens".
"""
from osh.meta import Id, Kind, LookupKind
osh/asdl_gen.py
@@ -1,9 +1,7 @@
 #!/usr/bin/env python
 from __future__ import print_function
 """
-ast_gen.py: Generate the Id enum in C code.
-# TODO: This should be renamed to asdl_gen.py
+asdl_gen.py - Generate Python and C from OSH ASDL schemas.
 """
 import os
osh/ast_lib.py
@@ -1,8 +1,6 @@
 #!/usr/bin/python
 """
-ast_lib.py
-Helpers for osh/osh.asdl
+ast_lib.py - Helpers for osh/osh.asdl
 """
 import sys
osh/bool_parse.py
@@ -12,8 +12,6 @@
 evaluation. So we are parsing a list of Word instances to an AST, rather than
 a list of strings.
-TODO: If we implement "test / [", we should share the parsing.
 Grammar from http://compilers.iecc.com/crenshaw/tutor6.txt, adapted to ANTLR
 syntax.
osh/cmd_parse.py
@@ -1119,7 +1119,6 @@ def ParseSubshell(self):
     # Ensure that something $( (cd / && pwd) ) works. If ) is already on the
     # translation stack, we want to delay it.
-    #print('ParseSubshell lexer.PushHint ) -> )')
     self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
     c_list = self.ParseCommandList()
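
An aside on PushHint for readers following along: the parser pushes a one-shot translation so that the lexer returns the next ) with a context-specific id. A standalone sketch of the idea (hypothetical HintStack class, not the code in core/lexer.py):

  # One-shot token translation: the top hint rewrites the next matching
  # token id, then pops.  This is how ')' can mean "end of subshell" here
  # but something else in other contexts.
  class HintStack(object):

    def __init__(self):
      self.stack = []  # (from_id, to_id) pairs

    def PushHint(self, from_id, to_id):
      self.stack.append((from_id, to_id))

    def Translate(self, token_id):
      if self.stack and self.stack[-1][0] == token_id:
        return self.stack.pop()[1]
      return token_id

  hints = HintStack()
  hints.PushHint('Op_RParen', 'Right_Subshell')
  assert hints.Translate('Op_RParen') == 'Right_Subshell'
  assert hints.Translate('Op_RParen') == 'Op_RParen'  # hint was consumed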
@@ -1399,7 +1398,6 @@ def ParseCommandTerm(self):
     done = False
     while not done:
       self._Peek()
-      #print('====> ParseCommandTerm word', self.cur_word)

       # Most keywords are valid "first words". But do/done/then do not BEGIN
       # commands, so they are not valid.
@@ -1460,10 +1458,8 @@ def ParseCommandList(self):
     easier.
     """
     self._NewlineOk()

     node = self.ParseCommandTerm()
     assert node is not None
-    assert node is not False
     return node

   def ParseWholeFile(self):
@@ -1479,7 +1475,6 @@ def ParseWholeFile(self):
"""
self._NewlineOk()
#print('ParseFile', self.c_kind, self.cur_word)
# An empty node to execute
if self.c_kind == Kind.Eof:
return ast.NoOp()
@@ -1488,7 +1483,6 @@ def ParseWholeFile(self):
     # ParseCommandLine(), like oil.InteractiveLoop.
     node = self.ParseCommandTerm()
     assert node is not None
-    assert node is not False

     # NOTE: This happens when there is no newline at the end of a file, like
     # osh -c 'cat <<EOF'
osh/word_parse.py
@@ -7,6 +7,45 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 """
 word_parse.py - Parse the shell word language.
+
+Hairy example:
+
+  hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
+
+Substitutions can be nested, but which inner subs are allowed depends on the
+outer sub.
+
+lex_mode_e.OUTER (_ReadLeftParts)
+  All subs and quotes are allowed:
+  $v ${v} $() `` $(()) '' "" $'' $"" <() >()
+
+lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
+  Var, Command, Arith, but no quotes.
+  $v ${v} $() `` $(())
+  No process substitution.
+
+lex_mode_e.ARITH
+  Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
+  allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
+  need those for associative array indexing.
+
+lex_mode_e.VS_ARG_UNQ
+  Like UNQUOTED, everything is allowed (even process substitutions), but we
+  stop at }, and space is SIGNIFICANT.
+
+  Example: ${a:- b }
+
+  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
+  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
+
+lex_mode_e.VS_ARG_DQ
+  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
+  "${x:-"default"}".
+
+  In contrast, VS_ARG_UNQ respects single quotes and process substitution.
+
+  It's weird that double quotes are allowed. Space is also significant here,
+  e.g. "${x:-a "b"}".
 """
 from osh.meta import Id, Kind, LookupKind
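
The rules in the new docstring can be summarized as a table. A toy rendering (illustration only; the real rules live in the lexer modes and the _Read*Parts methods), where vsub, csub, asub, and psub stand for variable, command, arithmetic, and process substitution:

  # Which constructs each lexer mode admits, per the docstring above.
  ALLOWED = {
      'OUTER':      {'vsub', 'csub', 'asub', 'psub', "''", '""', "$''", '$""'},
      'DQ':         {'vsub', 'csub', 'asub'},
      'ARITH':      {'vsub', 'csub', 'asub', "''", '""', "$''", '$""'},
      'VS_ARG_UNQ': {'vsub', 'csub', 'asub', 'psub', "''", '""', "$''", '$""'},
      'VS_ARG_DQ':  {'vsub', 'csub', 'asub', '""', "$''", '$""'},
  }

  # e.g. <(echo hi) is fine at the top level but not inside "..."
  assert 'psub' in ALLOWED['OUTER']
  assert 'psub' not in ALLOWED['DQ']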
@@ -25,50 +64,6 @@
 p_die = util.p_die
 log = util.log

-# Substitutions can be nested, but which inner subs are allowed depends on the
-# outer sub. See _ReadLeftParts vs. _ReadDoubleQuotedLeftParts.
-# lex_mode_e.OUTER
-# All subs and quotes are allowed --
-# $v ${v} $() `` $(()) '' "" $'' $"" <() >()
-#
-# lex_mode_e.DQ
-# Var, Command, Arith, but no quotes
-# $v ${v} $() `` $(())
-# No process substitution.
-#
-# lex_mode_e.ARITH:
-# Similar to DQ: Var, Command, Arith sub. No process sub. bash has no
-# quotes, but we are changing this in oil. We are adding ALL FOUR kinds of
-# quotes , because we need those for associtative array indexing.
-#
-# lex_mode_e.VS_ARG_UNQ
-# Like UNQUOTED, except we stop at }. Everything is allowed, even process
-# substitution.
-#
-# ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
-# ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
-#
-# But space is SIGNIFICANT. ${a:- b }
-# So you should NOT just read a bunch of words after :-, unless you also
-# preserve the space tokens between.
-# In other words, like DS_VS_ARG, except SINGLE Quotes allowed?
-#
-# lex_mode_e.VS_ARG_DQ
-# Can't be lex_mode_e.DQ because here we respect $' and $" tokens, while <(
-# token is not respected.
-#
-# Like VS_ARG_UNQ, but single quotes are NOT respected (they appear
-# literally), and process substitution is not respected (ditto).
-#
-# "" and $'' and $"" are respected, but not ''. I need a matrix for this.
-#
-# Like DQ, except nested "" and $'' and $"" are RESPECTED.
-#
-# It's weird that double quotes are allowed. Not sure why that would be.
-# Unquoted is also allowed, so " a "b" c " $'' and $"" are lame, because they
-# don't appear in the DQ context. I think I should parse those but DISALLOW.
-# You should always make $'' and $"" as a separate var!

 class WordParser(object):
@@ -409,7 +404,6 @@ def _ReadBracedBracedVarSub(self, d_quoted=False):
     if ty == Id.VSub_Pound:
       # Disambiguate
       t = self.lexer.LookAhead(lex_mode_e.VS_1)
-      #print("\t# LOOKAHEAD", t)
       if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
         # e.g. a name, '#' is the prefix
         self._Next(lex_mode_e.VS_1)
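
The lookahead resolves the two readings of #: in ${#x} it is the length operator, while ${#} is the variable # itself (the argument count). A sketch of the decision (illustration only, with token ids as strings):

  def DisambiguatePound(next_token_id):
    # Mirrors the test above: a following name means '#' is a prefix.
    if next_token_id not in ('Unknown_Tok', 'Right_VarSub'):
      return 'length-op'   # ${#x} -> length of $x
    return 'plain-var'     # ${#}  -> number of arguments, like $#

  assert DisambiguatePound('VSub_Name') == 'length-op'
  assert DisambiguatePound('Right_VarSub') == 'plain-var'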
@@ -427,7 +421,6 @@ def _ReadBracedBracedVarSub(self, d_quoted=False):
     elif ty == Id.VSub_Bang:
       t = self.lexer.LookAhead(lex_mode_e.VS_1)
-      #print("\t! LOOKAHEAD", t)
       if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
         # e.g. a name, '!' is the prefix
         # ${!a} -- this is a ref
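
Same pattern for !: ${!a} is an indirect reference (use the value of a as a variable name), while ${!} alone is the special variable !, i.e. $!. A model of the indirection (illustration only):

  def Deref(env, name):
    # ${!name}: look up name, then look up the result as a name.
    return env[env[name]]

  env = {'a': 'b', 'b': 'hello'}
  assert Deref(env, 'a') == 'hello'  # a=b; b=hello; echo ${!a}  -> hello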
@@ -506,7 +499,7 @@ def _ReadDoubleQuotedLeftParts(self):
     raise AssertionError(self.cur_token)

   def _ReadLeftParts(self):
-    """Read substitutions and quoted strings."""
+    """Read substitutions and quoted strings (for the OUTER context)."""

     if self.token_type == Id.Left_DoubleQuote:
       return self._ReadDoubleQuotedPart()
@@ -559,7 +552,6 @@ def _ReadExtGlobPart(self):
     while True:
       self._Peek()
-      #log('t %r', self.cur_token)

       if self.token_type == Id.Right_ExtGlob:
         if not read_word:
@@ -660,11 +652,9 @@ def _ReadCommandSubPart(self, token_type):
     left_token = self.cur_token
     left_spid = left_token.span_id
-    #print('_ReadCommandSubPart', self.cur_token)

     self._Next(lex_mode_e.OUTER)  # advance past $( or `

     # Set the lexer in a state so ) becomes the EOF token.
-    #print('_ReadCommandSubPart lexer.PushHint ) -> EOF')
     if token_type in (
         Id.Left_CommandSub, Id.Left_ProcSubIn, Id.Left_ProcSubOut):
       self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
@@ -680,7 +670,6 @@ def _ReadCommandSubPart(self, token_type):
     assert node is not None

     # Hm this creates its own word parser, which is thrown away?
-    #print('X', self.cur_token)
     right_spid = c_parser.w_parser.cur_token.span_id

     cs_part = ast.CommandSubPart(node, left_token)
@@ -784,7 +773,6 @@ def ReadDParen(self):
     self._Next(lex_mode_e.ARITH)
     anode = self._ReadArithExpr()
-    #print('xx ((', self.cur_token)

     if self.token_type != Id.Arith_RParen:
       p_die('Expected first ) to end arith statement, got %r',
             self.cur_token.val, token=self.cur_token)
@@ -882,15 +870,13 @@ def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
     could be an operator delimiting a compound word. Can we change lexer modes
     and remove this special case?
     """
-    #print('_ReadCompoundWord', lex_mode)
     word = ast.CompoundWord()

     num_parts = 0
     done = False
     while not done:
       allow_done = empty_ok or num_parts != 0
       self._Peek()
-      #print('CW',self.cur_token)

       if allow_done and self.token_type == eof_type:
         done = True  # e.g. for ${foo//pat/replace}
@@ -906,10 +892,7 @@ def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
         word.parts.append(part)

-        if self.token_type == Id.Lit_VarLike:
-          #print('@', self.cursor)
-          #print('@', self.cur_token)
+        if self.token_type == Id.Lit_VarLike:  # foo=
           t = self.lexer.LookAhead(lex_mode_e.OUTER)
           if t.id == Id.Op_LParen:
             self.lexer.PushHint(Id.Op_RParen, Id.Right_ArrayLiteral)
@@ -927,7 +910,6 @@ def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
         word.parts.append(part)

       elif self.token_kind == Kind.Left:
-        #print('_ReadLeftParts')
         part = self._ReadLeftParts()
         assert part is not None
         word.parts.append(part)
@@ -1022,7 +1004,6 @@ def _ReadWord(self, lex_mode):
       word: Word, or None if there was an error, or need_more is set
       need_more: True if the caller should call us again
     """
-    #print('_Read', lex_mode, self.cur_token)
     self._Peek()

     if self.token_kind == Kind.Eof:
@@ -1034,13 +1015,11 @@ def _ReadWord(self, lex_mode):
       self._Next(lex_mode)
       if self.token_type == Id.Op_Newline:
         if self.cursor_was_newline:
-          #print('SKIP(nl)', self.cur_token)
           return None, True

       return ast.TokenWord(self.cur_token), False

     elif self.token_kind == Kind.Right:
-      #print('WordParser.Read: Kind.Right', self.cur_token)
       if self.token_type not in (
           Id.Right_Subshell, Id.Right_FuncDef, Id.Right_CasePat,
           Id.Right_ArrayLiteral):
