Overhaul the lexing and evaluation of [[ foo =~ foo$ ]].
- The lexer mode is now simpler, and no longer derived from OUTER.
- Evaluation takes into account quoted parts in the word, which should
  be regex-escaped.

Motivated by an example in bash-completion, which I copied in the spec
test.

Also, change the spec tests to treat bash's behavior as authoritative.
zsh isn't relevant for our goals.
Andy Chu committed Sep 24, 2018
1 parent 9b8557b commit ca71604
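
The rule being implemented: in `[[ $s =~ pat ]]`, unquoted parts of the right-hand word are ERE syntax, while quoted parts should match literally, so they are regex-escaped before the assembled pattern reaches the regex engine. A minimal sketch of that flow, using Python's `re` as a stand-in for libc regcomp/regexec (the part representation and helper names here are illustrative, not OSH's actual API):

```python
import re

ERE_META_CHARS = '{}$'  # escape set from this commit; coverage may differ

def ere_escape(s):
    # Backslash-escape characters that regcomp() would otherwise interpret.
    return ''.join('\\' + c if c in ERE_META_CHARS else c for c in s)

def assemble_regex(parts):
    """parts: (text, was_quoted) pairs produced by word evaluation."""
    return ''.join(ere_escape(text) if quoted else text for text, quoted in parts)

# [[ $x =~ "$"([0-9]+) ]] : the quoted "$" is literal, the rest is ERE syntax.
pat = assemble_regex([('$', True), ('([0-9]+)', False)])
print(pat)                                  # \$([0-9]+)
print(bool(re.search(pat, 'cost is $42')))  # True
```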
Showing 7 changed files with 88 additions and 71 deletions.
core/expr_eval.py: 18 changes (11 additions, 7 deletions)
@@ -486,12 +486,13 @@ def _SetRegexMatches(self, matches):
"""For ~= to set the BASH_REMATCH array."""
state.SetGlobalArray(self.mem, 'BASH_REMATCH', matches)

- def _EvalCompoundWord(self, word, do_fnmatch=False):
+ def _EvalCompoundWord(self, word, do_fnmatch=False, do_ere=False):
"""
Args:
node: Id.Word_Compound
"""
- val = self.word_ev.EvalWordToString(word, do_fnmatch=do_fnmatch)
+ val = self.word_ev.EvalWordToString(word, do_fnmatch=do_fnmatch,
+                                     do_ere=do_ere)
return val.s

def Eval(self, node):
@@ -590,7 +591,9 @@ def Eval(self, node):
# Whether to glob escape
do_fnmatch = op_id in (Id.BoolBinary_GlobEqual, Id.BoolBinary_GlobDEqual,
Id.BoolBinary_GlobNEqual)
- s2 = self._EvalCompoundWord(node.right, do_fnmatch=do_fnmatch)
+ do_ere = (op_id == Id.BoolBinary_EqualTilde)
+ s2 = self._EvalCompoundWord(node.right, do_fnmatch=do_fnmatch,
+                             do_ere=do_ere)

# Now dispatch on arg type
arg_type = BOOL_ARG_TYPES[op_id]
@@ -647,12 +650,13 @@ def Eval(self, node):
return s1 != s2

if op_id == Id.BoolBinary_EqualTilde:
- regex_str = glob_.ExtendedRegexEscape(s2)
- #log('%r -> %r', s2, regex_str)
+ #log('Matching %r against regex %r', s1, s2)
try:
- matches = libc.regex_match(regex_str, s1)
+ matches = libc.regex_match(s2, s1)
except RuntimeError:
- e_die("Invalid regex %r", s2, word=node.right)
+ # 2 means a parse error. Note this is a fatal error in OSH but not
+ # in bash.
+ e_die("Invalid regex %r", s2, word=node.right, status=2)

if matches is None:
return False
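The evaluator now passes the evaluated word straight to the regex engine, and a pattern that fails to parse is a fatal error with status 2. A rough sketch of the dispatch, with Python's `re` standing in for `libc.regex_match()` and illustrative helper names:

```python
import re

class FatalError(Exception):
    """Stand-in for e_die(); carries the exit status."""
    def __init__(self, msg, status):
        super().__init__(msg)
        self.status = status

def regex_match(pat, s):
    """Stand-in for libc.regex_match(): the full match plus groups, or None."""
    m = re.search(pat, s)
    return None if m is None else [m.group(0)] + [g or '' for g in m.groups()]

def eval_equal_tilde(s1, s2, set_rematch):
    try:
        matches = regex_match(s2, s1)
    except re.error as e:
        # 2 means a parse error; fatal in OSH, but not in bash.
        raise FatalError('Invalid regex %r: %s' % (s2, e), status=2)
    if matches is None:
        return False
    set_rematch(matches)  # corresponds to _SetRegexMatches() -> BASH_REMATCH
    return True

rematch = []
print(eval_equal_tilde('a b', '^(a b)$', rematch.extend))  # True
print(rematch)                                             # ['a b', 'a b']
```

In bash a bad pattern only makes the `[[` command fail; making it fatal with status 2 is an OSH-specific choice noted in the new comment.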
core/glob_.py: 7 changes (3 additions, 4 deletions)
@@ -71,10 +71,9 @@ def GlobEscape(s):
# libc has a function to do this. Escape these characters:
# https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html Use

- # NOTE: Weird bash rule: (|) are literal and don't have to be escaped
- # The {} are the regex meta chars that are NOT bash meta chars?
- # This is very gross.
- ERE_META_CHARS = '{}'
+ # NOTE: Weird bash rule: (|) are literal and don't have to be escaped.
+ # The list of chars {}$ is determined by experience.
+ ERE_META_CHARS = '{}$'

def ExtendedRegexEscape(s):
"""
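ERE_META_CHARS grows from `{}` to `{}$`: characters that regcomp() treats specially but that are not glob metacharacters. A sketch of what `ExtendedRegexEscape` plausibly does with that set (the real implementation may differ in details such as backslash handling):

```python
ERE_META_CHARS = '{}$'  # per the diff: "determined by experience"

def extended_regex_escape(s):
    # Prefix each ERE metacharacter with a backslash so it matches literally
    # once the escaped text is spliced into an extended regular expression.
    out = []
    for c in s:
        if c in ERE_META_CHARS:
            out.append('\\')
        out.append(c)
    return ''.join(out)

print(extended_regex_escape('${x}'))  # \$\{x\}
print(extended_regex_escape('foo$'))  # foo\$
```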
core/test_builtin.py: 2 changes (1 addition, 1 deletion)
@@ -49,7 +49,7 @@ def ReadWord(self, unused_lex_mode):

class _WordEvaluator(object):

- def EvalWordToString(self, w, do_fnmatch=False):
+ def EvalWordToString(self, w, do_fnmatch=False, do_ere=False):
# do_fnmatch: for the [[ == ]] semantics which we don't have!
# I think I need another type of node
# Maybe it should be BuiltinEqual and BuiltinDEqual? Parse it into a
core/word_eval.py: 4 changes (3 additions, 1 deletion)
@@ -803,7 +803,7 @@ def _EvalWordToParts(self, word, quoted, part_vals):
else:
raise AssertionError(word.__class__.__name__)

- def EvalWordToString(self, word, do_fnmatch=False):
+ def EvalWordToString(self, word, do_fnmatch=False, do_ere=False):
"""
Args:
word: CompoundWord
@@ -831,6 +831,8 @@ def EvalWordToString(self, word, do_fnmatch=False):
# [[ foo == */"*".py ]] or case *.py) ... esac
if do_fnmatch and not part_val.do_split_glob:
s = glob_.GlobEscape(part_val.s)
+ elif do_ere and not part_val.do_split_glob:
+ s = glob_.ExtendedRegexEscape(part_val.s)
else:
s = part_val.s
else:
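EvalWordToString gains a do_ere flag that parallels do_fnmatch: parts that came from quoted text (do_split_glob is false) are ERE-escaped, everything else passes through as regex syntax. A simplified model of that branch, with a hypothetical part type standing in for the evaluator's part values:

```python
from typing import NamedTuple

class PartValue(NamedTuple):   # stand-in for the evaluator's string part values
    s: str
    do_split_glob: bool        # False for parts that came from quoted text

def ere_escape(s, meta='{}$'):  # same idea as glob_.ExtendedRegexEscape above
    return ''.join('\\' + c if c in meta else c for c in s)

def eval_word_to_string(part_vals, do_ere=False):
    out = []
    for pv in part_vals:
        if do_ere and not pv.do_split_glob:
            out.append(ere_escape(pv.s))  # quoted part: must match literally
        else:
            out.append(pv.s)              # unquoted part: ERE syntax as-is
    return ''.join(out)

# [[ $v =~ ^'${'[A-Z]+ ]] : the quoted '${' must be matched literally.
parts = [PartValue('^', True), PartValue('${', False), PartValue('[A-Z]+', True)]
print(eval_word_to_string(parts, do_ere=True))  # ^\$\{[A-Z]+
```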
osh/bool_parse.py: 12 changes (4 additions, 8 deletions)
@@ -217,14 +217,10 @@ def ParseFactor(self):

right = self.cur_word
if is_regex:
- # Check syntax. TODO: We should have rhs = Dynamic | Static.
- ok, s, unused_quoted = word.StaticEval(right)
- if ok:
- regex_str = glob_.ExtendedRegexEscape(s)
- try:
- libc.regex_parse(regex_str)
- except RuntimeError as e:
- p_die("Error parsing regex %r: %s", regex_str, e, word=right)
+ # NOTE: StaticEval for checking regex syntax isn't enough. We could
+ # need to pass do_ere so that the quoted parts get escaped.
+ #ok, s, unused_quoted = word.StaticEval(right)
+ pass

self._Next()
return ast.BoolBinary(op, left, right)
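The parse-time regcomp() check is dropped because the statically evaluated word is not the string that gets compiled at run time once quoted parts are escaped, so validating it can be misleading. A small illustration of the gap, with Python's `re` as the stand-in engine and the escape set from this commit:

```python
import re

# The word  a"{2}"  in [[ $x =~ a"{2}" ]]:
static_view  = 'a{2}'     # what word.StaticEval() sees at parse time
runtime_view = r'a\{2\}'  # what evaluation yields once the quoted part is escaped

print(bool(re.fullmatch(static_view, 'aa')))     # True: {2} acts as a repetition
print(bool(re.fullmatch(runtime_view, 'a{2}')))  # True: \{2\} is the literal text
```

Both patterns compile, but they mean different things, so a clean compile of the static view proves nothing about the pattern that is actually matched.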
osh/lex.py: 72 changes (34 additions, 38 deletions)
@@ -335,28 +335,43 @@ def IsKeyword(name):
R(r'[^\0]', Id.Lit_Other), # everything else is literal
]

# Notes on BASH_REGEX states
#
# From bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
# matched as a string.
# - Bracket expressions in regular expressions must be treated carefully, since
# normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
# expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From code: ( | ) are treated special.

LEXER_DEF[lex_mode_e.BASH_REGEX] = [
# Match these literals first, and then the rest of the OUTER state I guess.
# That's how bash works.
#
# At a minimum, you do need $ and ~ expansions to happen. <>;& could have
# been allowed unescaped too, but that's not what bash does. The criteria
# was whether they were "special" in both languages, which seems dubious.

LEXER_DEF[lex_mode_e.BASH_REGEX] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
# NOTE: bash accounts for spaces and non-word punctuation like ; inside ()
# and []. We will avoid that and ask the user to extract a variable.

C('(', Id.Lit_Chars),
C(')', Id.Lit_Chars),
C('|', Id.Lit_Chars),
] + [
# Avoid "unreachable rule error"
(is_regex, pat, re_list) for
(is_regex, pat, re_list) in _UNQUOTED
if not (is_regex == False and pat in ('(', ')', '|'))
]
# and []. We will avoid that and ask the user to extract a variable?

R(r'[a-zA-Z0-9_/-]+', Id.Lit_Chars), # not including period
R(r'[ \t\r]+', Id.WS_Space),

# From _BACKSLASH
R(r'\\[^\n\0]', Id.Lit_EscapedChar),
C('\\\n', Id.Ignored_LineCont),

#C('{', Id.Lit_RegexMeta), # { -> \{
#C('}', Id.Lit_RegexMeta), # } -> \}
# In [[ foo =~ foo$ ]], the $ doesn't get escaped
#C('$', Id.Lit_RegexMeta),

# NOTE: ( | and ) aren't operators!
R(r'[^\0]', Id.Lit_Other), # everything else is literal
]

LEXER_DEF[lex_mode_e.DQ] = [
# Only 4 characters are backslash escaped inside "".
@@ -507,25 +522,6 @@ def IsKeyword(name):
R(r'[^\0]', Id.Unknown_Tok) # any char. This should be a syntax error.
]

# Notes on BASH_REGEX states
#
# From bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
# matched as a string.
# - Bracket expressions in regular expressions must be treated carefully, since
# normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
# expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From code: ( | ) are treated special.


# A lexer for the parser that converts globs to extended regexes. Since we're
# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
# don't need lexer modes here.
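The new BASH_REGEX mode is a short, self-contained rule list (runs of word characters, whitespace, backslash escapes, and a catch-all that keeps everything else literal) layered on the usual substitution and variable rules, instead of a filtered copy of the OUTER table. A toy model of how such an ordered rule table tokenizes the pattern from the bash-completion example; only the literal rules are modeled, the token names are illustrative, and the real lexer's matching machinery differs:

```python
import re

RULES = [                                      # first matching rule wins
    (re.compile(r'\\[^\n]'),         'Lit_EscapedChar'),
    (re.compile(r'[a-zA-Z0-9_/-]+'), 'Lit_Chars'),   # not including period
    (re.compile(r'[ \t\r]+'),        'WS_Space'),
    (re.compile(r'.', re.DOTALL),    'Lit_Other'),   # everything else is literal
]

def tokenize(s):
    pos, out = 0, []
    while pos < len(s):
        for pat, tok_id in RULES:
            m = pat.match(s, pos)
            if m:
                out.append((tok_id, m.group(0)))
                pos = m.end()
                break
        else:
            raise AssertionError('no rule matched at %d' % pos)
    return out

for tok in tokenize(r'^(\$\{?)([A-Za-z0-9_]*)$'):
    print(tok)
```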
spec/regex.test.sh: 44 changes (32 additions, 12 deletions)
@@ -60,18 +60,18 @@ argv.py "${BASH_REMATCH[@]}"
#### Regex quoted with single quotes
# bash doesn't like the quotes
[[ 'a b' =~ '^(a b)$' ]] && echo true
- ## stdout: true
- ## status: 0
- ## OK bash stdout-json: ""
- ## OK bash status: 1
+ ## stdout-json: ""
+ ## status: 1
+ ## OK zsh stdout: true
+ ## OK zsh status: 0

#### Regex quoted with double quotes
# bash doesn't like the quotes
[[ 'a b' =~ "^(a b)$" ]] && echo true
- ## stdout: true
- ## status: 0
- ## OK bash stdout-json: ""
- ## OK bash status: 1
+ ## stdout-json: ""
+ ## status: 1
+ ## OK zsh stdout: true
+ ## OK zsh status: 0

#### Fix single quotes by storing in variable
pat='^(a b)$'
@@ -86,10 +86,10 @@ pat="^(a b)$"
#### Double quoting pat variable -- again bash doesn't like it.
pat="^(a b)$"
[[ 'a b' =~ "$pat" ]] && echo true
- ## stdout: true
- ## status: 0
- ## OK bash stdout-json: ""
- ## OK bash status: 1
+ ## stdout-json: ""
+ ## status: 1
+ ## OK zsh stdout: true
+ ## OK zsh status: 0

#### Regex with == and not =~ is parse error, different lexer mode required
# They both give a syntax error. This is lame.
@@ -146,6 +146,26 @@ status=0
status=1
## END

+ #### Escaped {
+ # from bash-completion
+ [[ '$PA' =~ ^(\$\{?)([A-Za-z0-9_]*)$ ]] && argv.py "${BASH_REMATCH[@]}"
+ ## STDOUT:
+ ['$PA', '$', 'PA']
+ ## END
+ ## BUG zsh stdout-json: ""
+ ## BUG zsh status: 1
+
+ #### Escaped { stored in variable first
+ # from bash-completion
+ pat='^(\$\{?)([A-Za-z0-9_]*)$'
+ [[ '$PA' =~ $pat ]] && argv.py "${BASH_REMATCH[@]}"
+ ## STDOUT:
+ ['$PA', '$', 'PA']
+ ## END
+ ## BUG zsh STDOUT:
+ ['']
+ ## END

#### regex with ?
[[ 'c' =~ c? ]] && echo true
[[ '' =~ c? ]] && echo true
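A quick cross-check of the expected BASH_REMATCH contents in the new "Escaped {" tests, using Python's `re`, which is close enough to POSIX ERE for this pattern:

```python
import re

pat = r'^(\$\{?)([A-Za-z0-9_]*)$'        # the pattern from the new spec tests
m = re.match(pat, '$PA')
print([m.group(0)] + list(m.groups()))   # ['$PA', '$', 'PA']
```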
