Permalink
Browse files

Lex and parse extended globs.

Add more tests too.  We're not executing/matching them yet.

This required a new lexer mode LexMode.EXTGLOB and a new token kind
Kind.ExtGlob.

Note:

I ran into an ambiguity with [[ !(1 == 2) ]], and solved it by defining
it away.  See doc/osh-manual.md.

[[ ! (1 == 2) ]] with a space is the correct syntax for a boolean
expression.

We have a simpler rule than Bash for this corner case.  shopt -s extglob
is a no-op.

Unrelated:
- Add a test case for the BASH_REGEX lexer mode.
  • Loading branch information...
Andy Chu
Andy Chu committed Oct 18, 2017
1 parent 467a4d8 commit 4e64a4d65858efebb74a7985291cc895901f7758
Showing with 217 additions and 14 deletions.
  1. +3 −0 core/id_kind.py
  2. +1 −1 core/word_eval.py
  3. +19 −3 doc/osh-manual.md
  4. +27 −4 osh/lex.py
  5. +39 −1 osh/lex_test.py
  6. +1 −1 osh/osh.asdl
  7. +63 −2 osh/word_parse.py
  8. +1 −1 spec/dbracket.test.sh
  9. +56 −0 spec/extended-glob.test.sh
  10. +6 −0 spec/regex.test.sh
  11. +1 −1 test/spec.sh
View
@@ -238,8 +238,11 @@ def _AddKinds(spec):
'FuncDef', # )
'CasePat', # )
'ArrayLiteral', # )
'ExtGlob', # )
])
spec.AddKind('ExtGlob', ['At', 'Star', 'Plus', 'QMark', 'Bang'])
# First position of var sub ${
# Id.VOp2_Pound -- however you can't tell the difference at first! It could
# be an op or a name. So it makes sense to base it on the state.
View
@@ -902,7 +902,7 @@ def _EvalWordPart(self, part, quoted=False):
return [runtime.StringPartValue(str(num), True, True)]
else:
raise AssertionError(part.tag)
raise AssertionError(part.__class__.__name__)
class _WordEvaluator:
View
@@ -33,15 +33,32 @@ Very good articles on bash errexit:
- http://mywiki.wooledge.org/BashFAQ/105
- http://fvue.nl/wiki/Bash:_Error_handling
## Notable Gotchas in Parsing
Arith Sub vs. Command Sub:
- Unlike bash, `$((` always starts an arith sub. `$( (echo hi) )` is a
subshell inside a command sub. (This construct should be written
`({ echo hi;})` anyway.)
Extended Glob vs. Negation of Expression:
- `[[ !(a == a) ]]` is always an extended glob.
- `[[ ! (a == a) ]]` is the negation of an equality test.
- In bash the rules are much more complicated, and depend on `shopt -s
extglob`. That flag is a no-op in OSH. OSH avoids dynamic parsing, while
bash does it in many places.
## Unicode
Encoding of programs should be utf-8.
But those programs can manipulate data in ANY encoding?
echo $'[\u03bc]' # C-escaped string
echo $'[\u03bc]' # C-escaped string
vs literal unicode vs. echo -e. $'' is preferred because it's statically parsed.
vs literal unicode vs. `echo -e`. `$''` is preferred because it's statically
parsed.
List of operations that are Unicode-aware:
@@ -55,7 +72,6 @@ List of operations that are Unicode-aware:
- ${s#.} # remove one character
- sorting [[ $a < $b ]] -- should use current locale? I guess that is like the
'sort' command.
- prompt string has time, which is locale-specific.
View
@@ -25,9 +25,9 @@
DBRACKET
SQ DQ DOLLAR_SQ
ARITH
EXTGLOB
VS_1 VS_2 VS_ARG_UNQ VS_ARG_DQ
BASH_REGEX
BASH_REGEX_CHARS
BASH_REGEX BASH_REGEX_CHARS
""".split())
# In oil, I hope to have these lexer modes:
@@ -162,6 +162,15 @@
R(r'.', Id.Lit_Other), # any other single char is a literal
]
# In OUTER and DBRACKET states.
_EXTGLOB_BEGIN = [
C('@(', Id.ExtGlob_At),
C('*(', Id.ExtGlob_Star),
C('+(', Id.ExtGlob_Plus),
C('?(', Id.ExtGlob_QMark),
C('!(', Id.ExtGlob_Bang),
]
_KEYWORDS = [
# NOTE: { is matched elsewhere
C('[[', Id.KW_DLeftBracket),
@@ -216,7 +225,7 @@ def IsKeyword(name):
# of <Lit_Chars "if">.
LEXER_DEF[LexMode.OUTER] = [
C('((', Id.Op_DLeftParen), # not allowed within [[
] + _KEYWORDS + _MORE_KEYWORDS + _UNQUOTED
] + _KEYWORDS + _MORE_KEYWORDS + _UNQUOTED + _EXTGLOB_BEGIN
# DBRACKET: can be like OUTER, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
@@ -226,7 +235,21 @@ def IsKeyword(name):
C('!', Id.KW_Bang),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
ID_SPEC.LexerPairs(Kind.BoolBinary) + \
_UNQUOTED
_UNQUOTED + _EXTGLOB_BEGIN
# Inside an extended glob, most characters are literals, including spaces and
# punctuation. We also accept \, $var, ${var}, "", etc. They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[LexMode.EXTGLOB] = \
_BACKSLASH + _LEFT_SUBS + _VARS + _EXTGLOB_BEGIN + [
R(r'[^\\$`"\'|)@*+!?]+', Id.Lit_Chars),
C('|', Id.Op_Pipe),
C(')', Id.Op_RParen), # may be translated to Id.ExtGlob_RParen
C('\0', Id.Eof_Real),
R('.', Id.Lit_Chars), # everything else is literal
]
LEXER_DEF[LexMode.BASH_REGEX] = [
# Match these literals first, and then the rest of the OUTER state I guess.
View
@@ -27,7 +27,8 @@ def _InitLexer(s):
class LexerTest(unittest.TestCase):
def assertTokensEqual(self, left, right):
self.assertTrue(TokensEqual(left, right))
self.assertTrue(
TokensEqual(left, right), 'Expected %r, got %r' % (left, right))
def testRead(self):
lexer = _InitLexer(CMD)
@@ -72,6 +73,43 @@ def testRead_VS_ARG_UNQ(self):
#t = l.Read(LexMode.VS_ARG_UNQ)
print(t)
def testExtGlob(self):
lexer = _InitLexer('@(foo|bar)')
t = lexer.Read(LexMode.OUTER)
self.assertTokensEqual(ast.token(Id.ExtGlob_At, '@('), t)
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.Lit_Chars, 'foo'), t)
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.Op_Pipe, '|'), t)
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.Lit_Chars, 'bar'), t)
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.Op_RParen, ')'), t)
# Individual cases
lexer = _InitLexer('@(')
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.ExtGlob_At, '@('), t)
lexer = _InitLexer('*(')
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.ExtGlob_Star, '*('), t)
lexer = _InitLexer('?(')
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.ExtGlob_QMark, '?('), t)
lexer = _InitLexer('$')
t = lexer.Read(LexMode.EXTGLOB)
self.assertTokensEqual(ast.token(Id.Lit_Chars, '$'), t)
def testBashRegexState(self):
lexer = _InitLexer('(foo|bar)')
View
@@ -86,7 +86,7 @@ module osh
-- {a..f} or {a..f..2} or {a..f..-2}
| BracedCharRangePart(string start, string end, int? step)
-- extended globs are parsed statically, unlike globs
| ExtGlobPart(id op_id, word* arms)
| ExtGlobPart(token op, word* arms)
word =
TokenWord(token token)
View
@@ -563,6 +563,57 @@ def _ReadLeftParts(self):
raise AssertionError('%s not handled' % self.cur_token)
def _ReadExtGlobPart(self):
"""
Grammar:
Item = CompoundWord | EPSILON # important: @(foo|) is allowed
LEFT = '@(' | '*(' | '+(' | '?(' | '!('
RIGHT = ')'
ExtGlob = LEFT (Item '|')* Item RIGHT # ITEM may be empty
CompoundWord includes ExtGlobPart
"""
left_token = self.cur_token
arms = []
#log('left %r', left_token)
self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
self._Next(LexMode.EXTGLOB) # advance past LEFT
read_word = False # did we just read a word? To handle @(||).
while True:
self._Peek()
#log('t %r', self.cur_token)
if self.token_type == Id.Right_ExtGlob:
if not read_word:
arms.append(ast.CompoundWord())
break
elif self.token_type == Id.Op_Pipe:
if not read_word:
arms.append(ast.CompoundWord())
read_word = False
self._Next(LexMode.EXTGLOB)
# lex mode EXTGLOB should only produce these 4 kinds of tokens
elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
w = self._ReadCompoundWord(lex_mode=LexMode.EXTGLOB)
arms.append(w)
read_word = True
elif self.token_kind == Kind.Eof:
self.AddErrorContext(
'Unexpected EOF reading extended glob that began here',
token=left_token)
return None
else:
raise AssertionError('Unexpected token %r' % self.cur_token)
part = ast.ExtGlobPart(left_token, arms)
return part
def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
"""
Args:
@@ -894,6 +945,10 @@ def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok, lex_mode=LexMode.OUTER,
"""
Precondition: Looking at the first token of the first word part
Postcondition: Looking at the token after, e.g. space or operator
NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
could be an operator delimiting a compound word. Can we change lexer modes
and remove this special case?
"""
#print('_ReadCompoundWord', lex_mode)
word = ast.CompoundWord()
@@ -905,7 +960,7 @@ def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok, lex_mode=LexMode.OUTER,
self._Peek()
#print('CW',self.cur_token)
if allow_done and self.token_type == eof_type:
done = True # e.g. for ${}
done = True # e.g. for ${foo//pat/replace}
# Keywords like "for" are treated like literals
elif self.token_kind in (
@@ -937,6 +992,12 @@ def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok, lex_mode=LexMode.OUTER,
part = ast.SimpleVarSub(self.cur_token)
word.parts.append(part)
elif self.token_kind == Kind.ExtGlob:
part = self._ReadExtGlobPart()
if not part:
return None
word.parts.append(part)
elif self.token_kind == Kind.Left:
#print('_ReadLeftParts')
part = self._ReadLeftParts()
@@ -1077,7 +1138,7 @@ def _ReadWord(self, lex_mode):
elif self.token_kind in (
Kind.VSub, Kind.Lit, Kind.Left, Kind.KW, Kind.Assign, Kind.ControlFlow,
Kind.BoolUnary, Kind.BoolBinary):
Kind.BoolUnary, Kind.BoolBinary, Kind.ExtGlob):
# We're beginning a word. If we see Id.Lit_Pound, change to
# LexMode.COMMENT and read until end of line. (TODO: How to add comments
# to AST?)
View
@@ -63,7 +63,7 @@ var='one two'
# Notes on whitespace:
# - 1 and == need space separating them, but ! and ( don't.
# - [[ needs whitespace after it, but ]] doesn't need whitespace before it!
[[ ''||!(1 == 2)&&(2 == 2)]] && echo true
[[ ''||! (1 == 2)&&(2 == 2)]] && echo true
# stdout: true
# NOTE on the two cases below. We're comparing
View
@@ -53,6 +53,23 @@ touch _tmp/extglob2/{foo,bar}.cc _tmp/extglob2/{foo,bar,baz}.h
echo _tmp/extglob2/!(*.h)
# stdout: _tmp/extglob2/bar.cc _tmp/extglob2/foo.cc
### glob spaces
shopt -s extglob
mkdir -p _tmp/eg4
touch _tmp/eg4/a '_tmp/eg4/a b' _tmp/eg4/foo
argv.py _tmp/eg4/@(a b|foo)
# stdout: ['_tmp/eg4/a b', '_tmp/eg4/foo']
### glob other punctuation chars (lexer mode)
# mksh sorts them differently
shopt -s extglob
mkdir -p _tmp/eg5
cd _tmp/eg5
touch __{'<>','{}','|','#','&&'}
argv.py @('__<>'|__{}|__\||__#|__&&)
# stdout: ['__<>', '__|', '__{}', '__&&', '__#']
# OK mksh stdout: ['__#', '__&&', '__<>', '__{}', '__|']
### @ matches exactly one
[[ --verbose == --@(help|verbose) ]] && echo TRUE
[[ --oops == --@(help|verbose) ]] || echo FALSE
@@ -137,6 +154,20 @@ quoted='--@(help|verbose)'
# stdout-json: "TRUE\nTRUE\nFALSE\nFALSE\nFALSE\n"
# N-I mksh stdout-json: "FALSE\nFALSE\nFALSE\n"
### extglob empty string
shopt -s extglob
[[ '' == @(foo|bar) ]] || echo FALSE
[[ '' == @(foo||bar) ]] && echo TRUE
# stdout-json: "FALSE\nTRUE\n"
### extglob empty pattern
shopt -s extglob
[[ '' == @() ]] && echo TRUE
[[ '' == @(||) ]] && echo TRUE
[[ X == @() ]] || echo FALSE
[[ '|' == @(||) ]] || echo FALSE
# stdout-json: "TRUE\nTRUE\nFALSE\nFALSE\n"
### printing extglob in variable
# mksh does static parsing so it doesn't like this?
shopt -s extglob
@@ -175,4 +206,29 @@ for word in --help --verbose --unmatched -- -zxzx -; do
done
# stdout-json: "A\nA\nU\nB\nC\nD\n"
### Without shopt -s extglob
empty=''
str='x'
[[ $empty == !($str) ]] && echo TRUE # test glob match
[[ $str == !($str) ]] || echo FALSE
# stdout-json: "TRUE\nFALSE\n"
### Turning extglob on changes the meaning of [[ !(str) ]] in bash
empty=''
str='x'
[[ !($empty) ]] && echo TRUE # test if $empty is empty
[[ !($str) ]] || echo FALSE # test if $str is empty
shopt -s extglob # mksh doesn't have this
[[ !($empty) ]] && echo TRUE # negated glob
[[ !($str) ]] && echo TRUE # negated glob
# stdout-json: "TRUE\nFALSE\nTRUE\nTRUE\n"
# OK mksh stdout-json: "TRUE\nTRUE\nTRUE\n"
### With extglob on, !($str) on the left or right of == has different meanings
shopt -s extglob
empty=''
str='x'
[[ 1 == !($str) ]] && echo TRUE # glob match
[[ !($str) == 1 ]] || echo FALSE # test if empty
# NOTE: There cannot be a space between ! and (?
# stdout-json: "TRUE\nFALSE\n"
View
@@ -114,3 +114,9 @@ pat="^(a b)$"
# stdout: true
# N-I zsh stdout-json: ""
# N-I zsh status: 1
### Double quoted regex gets regex-escaped
[[ { =~ "{" ]] && echo true
# stdout: true
# N-I zsh status: 1
# N-I zsh stdout-json: ""
View
@@ -465,7 +465,7 @@ brace-expansion() {
}
regex() {
sh-spec spec/regex.test.sh --osh-failures-allowed 2 \
sh-spec spec/regex.test.sh --osh-failures-allowed 3 \
$BASH $ZSH $OSH "$@"
}

0 comments on commit 4e64a4d

Please sign in to comment.