Overhaul the lexing and evaluation of [[ foo =~ foo$ ]].
- The lexer mode is now simpler, and no longer derived from OUTER.
- Evaluation takes into account quoted parts in the word, which should
  be regex-escaped.

Motivated by an example in bash-completion, which I copied in the spec
test.

Also, change the spec tests to treat bash's behavior as authoritative.
zsh isn't relevant for our goals.
Andy Chu committed Sep 24, 2018
1 parent 9b8557b commit ca71604
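
The rule being implemented: in `[[ $s =~ pat ]]`, unquoted parts of the right-hand word are ERE syntax, while quoted parts should match literally, so they are regex-escaped before the assembled pattern reaches the regex engine. A minimal sketch of that flow, using Python's `re` as a stand-in for libc regcomp/regexec (the part representation and helper names here are illustrative, not OSH's actual API):

```python
import re

ERE_META_CHARS = '{}$'  # escape set from this commit; coverage may differ

def ere_escape(s):
    # Backslash-escape characters that regcomp() would otherwise interpret.
    return ''.join('\\' + c if c in ERE_META_CHARS else c for c in s)

def assemble_regex(parts):
    """parts: (text, was_quoted) pairs produced by word evaluation."""
    return ''.join(ere_escape(text) if quoted else text for text, quoted in parts)

# [[ $x =~ "$"([0-9]+) ]] : the quoted "$" is literal, the rest is ERE syntax.
pat = assemble_regex([('$', True), ('([0-9]+)', False)])
print(pat)                                  # \$([0-9]+)
print(bool(re.search(pat, 'cost is $42')))  # True
```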
Showing 7 changed files with 88 additions and 71 deletions.
core/expr_eval.py: 18 changes (11 additions, 7 deletions)
@@ -486,12 +486,13 @@ def _SetRegexMatches(self, matches):
"""For ~= to set the BASH_REMATCH array."""
state.SetGlobalArray(self.mem, 'BASH_REMATCH', matches)

- def _EvalCompoundWord(self, word, do_fnmatch=False):
+ def _EvalCompoundWord(self, word, do_fnmatch=False, do_ere=False):
"""
Args:
node: Id.Word_Compound
"""
- val = self.word_ev.EvalWordToString(word, do_fnmatch=do_fnmatch)
+ val = self.word_ev.EvalWordToString(word, do_fnmatch=do_fnmatch,
+                                     do_ere=do_ere)
return val.s

def Eval(self, node):
@@ -590,7 +591,9 @@ def Eval(self, node):
# Whether to glob escape
do_fnmatch = op_id in (Id.BoolBinary_GlobEqual, Id.BoolBinary_GlobDEqual,
Id.BoolBinary_GlobNEqual)
- s2 = self._EvalCompoundWord(node.right, do_fnmatch=do_fnmatch)
+ do_ere = (op_id == Id.BoolBinary_EqualTilde)
+ s2 = self._EvalCompoundWord(node.right, do_fnmatch=do_fnmatch,
+                             do_ere=do_ere)

# Now dispatch on arg type
arg_type = BOOL_ARG_TYPES[op_id]
@@ -647,12 +650,13 @@ def Eval(self, node):
return s1 != s2

if op_id == Id.BoolBinary_EqualTilde:
- regex_str = glob_.ExtendedRegexEscape(s2)
- #log('%r -> %r', s2, regex_str)
+ #log('Matching %r against regex %r', s1, s2)
try:
- matches = libc.regex_match(regex_str, s1)
+ matches = libc.regex_match(s2, s1)
except RuntimeError:
- e_die("Invalid regex %r", s2, word=node.right)
+ # 2 means a parse error. Note this is a fatal error in OSH but not
+ # in bash.
+ e_die("Invalid regex %r", s2, word=node.right, status=2)

if matches is None:
return False
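The evaluator now passes the evaluated word straight to the regex engine, and a pattern that fails to parse is a fatal error with status 2. A rough sketch of the dispatch, with Python's `re` standing in for `libc.regex_match()` and illustrative helper names:

```python
import re

class FatalError(Exception):
    """Stand-in for e_die(); carries the exit status."""
    def __init__(self, msg, status):
        super().__init__(msg)
        self.status = status

def regex_match(pat, s):
    """Stand-in for libc.regex_match(): the full match plus groups, or None."""
    m = re.search(pat, s)
    return None if m is None else [m.group(0)] + [g or '' for g in m.groups()]

def eval_equal_tilde(s1, s2, set_rematch):
    try:
        matches = regex_match(s2, s1)
    except re.error as e:
        # 2 means a parse error; fatal in OSH, but not in bash.
        raise FatalError('Invalid regex %r: %s' % (s2, e), status=2)
    if matches is None:
        return False
    set_rematch(matches)  # corresponds to _SetRegexMatches() -> BASH_REMATCH
    return True

rematch = []
print(eval_equal_tilde('a b', '^(a b)$', rematch.extend))  # True
print(rematch)                                             # ['a b', 'a b']
```

In bash a bad pattern only makes the `[[` command fail; making it fatal with status 2 is an OSH-specific choice noted in the new comment.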
core/glob_.py: 7 changes (3 additions, 4 deletions)
@@ -71,10 +71,9 @@ def GlobEscape(s):
# libc has a function to do this. Escape these characters:
# https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html Use

- # NOTE: Weird bash rule: (|) are literal and don't have to be escaped
- # The {} are the regex meta chars that are NOT bash meta chars?
- # This is very gross.
- ERE_META_CHARS = '{}'
+ # NOTE: Weird bash rule: (|) are literal and don't have to be escaped.
+ # The list of chars {}$ is determined by experience.
+ ERE_META_CHARS = '{}$'

def ExtendedRegexEscape(s):
"""
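ERE_META_CHARS grows from `{}` to `{}$`: characters that regcomp() treats specially but that are not glob metacharacters. A sketch of what `ExtendedRegexEscape` plausibly does with that set (the real implementation may differ in details such as backslash handling):

```python
ERE_META_CHARS = '{}$'  # per the diff: "determined by experience"

def extended_regex_escape(s):
    # Prefix each ERE metacharacter with a backslash so it matches literally
    # once the escaped text is spliced into an extended regular expression.
    out = []
    for c in s:
        if c in ERE_META_CHARS:
            out.append('\\')
        out.append(c)
    return ''.join(out)

print(extended_regex_escape('${x}'))  # \$\{x\}
print(extended_regex_escape('foo$'))  # foo\$
```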
core/test_builtin.py: 2 changes (1 addition, 1 deletion)
@@ -49,7 +49,7 @@ def ReadWord(self, unused_lex_mode):

class _WordEvaluator(object):

- def EvalWordToString(self, w, do_fnmatch=False):
+ def EvalWordToString(self, w, do_fnmatch=False, do_ere=False):
# do_fnmatch: for the [[ == ]] semantics which we don't have!
# I think I need another type of node
# Maybe it should be BuiltinEqual and BuiltinDEqual? Parse it into a
core/word_eval.py: 4 changes (3 additions, 1 deletion)
@@ -803,7 +803,7 @@ def _EvalWordToParts(self, word, quoted, part_vals):
else:
raise AssertionError(word.__class__.__name__)

- def EvalWordToString(self, word, do_fnmatch=False):
+ def EvalWordToString(self, word, do_fnmatch=False, do_ere=False):
"""
Args:
word: CompoundWord
@@ -831,6 +831,8 @@ def EvalWordToString(self, word, do_fnmatch=False):
# [[ foo == */"*".py ]] or case *.py) ... esac
if do_fnmatch and not part_val.do_split_glob:
s = glob_.GlobEscape(part_val.s)
+ elif do_ere and not part_val.do_split_glob:
+ s = glob_.ExtendedRegexEscape(part_val.s)
else:
s = part_val.s
else:
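EvalWordToString gains a do_ere flag that parallels do_fnmatch: parts that came from quoted text (do_split_glob is false) are ERE-escaped, everything else passes through as regex syntax. A simplified model of that branch, with a hypothetical part type standing in for the evaluator's part values:

```python
from typing import NamedTuple

class PartValue(NamedTuple):   # stand-in for the evaluator's string part values
    s: str
    do_split_glob: bool        # False for parts that came from quoted text

def ere_escape(s, meta='{}$'):  # same idea as glob_.ExtendedRegexEscape above
    return ''.join('\\' + c if c in meta else c for c in s)

def eval_word_to_string(part_vals, do_ere=False):
    out = []
    for pv in part_vals:
        if do_ere and not pv.do_split_glob:
            out.append(ere_escape(pv.s))  # quoted part: must match literally
        else:
            out.append(pv.s)              # unquoted part: ERE syntax as-is
    return ''.join(out)

# [[ $v =~ ^'${'[A-Z]+ ]] : the quoted '${' must be matched literally.
parts = [PartValue('^', True), PartValue('${', False), PartValue('[A-Z]+', True)]
print(eval_word_to_string(parts, do_ere=True))  # ^\$\{[A-Z]+
```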
osh/bool_parse.py: 12 changes (4 additions, 8 deletions)
@@ -217,14 +217,10 @@ def ParseFactor(self):

right = self.cur_word
if is_regex:
- # Check syntax. TODO: We should have rhs = Dynamic | Static.
- ok, s, unused_quoted = word.StaticEval(right)
- if ok:
- regex_str = glob_.ExtendedRegexEscape(s)
- try:
- libc.regex_parse(regex_str)
- except RuntimeError as e:
- p_die("Error parsing regex %r: %s", regex_str, e, word=right)
+ # NOTE: StaticEval for checking regex syntax isn't enough. We could
+ # need to pass do_ere so that the quoted parts get escaped.
+ #ok, s, unused_quoted = word.StaticEval(right)
+ pass

self._Next()
return ast.BoolBinary(op, left, right)
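The parse-time regcomp() check is dropped because the statically evaluated word is not the string that gets compiled at run time once quoted parts are escaped, so validating it can be misleading. A small illustration of the gap, with Python's `re` as the stand-in engine and the escape set from this commit:

```python
import re

# The word  a"{2}"  in [[ $x =~ a"{2}" ]]:
static_view  = 'a{2}'     # what word.StaticEval() sees at parse time
runtime_view = r'a\{2\}'  # what evaluation yields once the quoted part is escaped

print(bool(re.fullmatch(static_view, 'aa')))     # True: {2} acts as a repetition
print(bool(re.fullmatch(runtime_view, 'a{2}')))  # True: \{2\} is the literal text
```

Both patterns compile, but they mean different things, so a clean compile of the static view proves nothing about the pattern that is actually matched.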
osh/lex.py: 72 changes (34 additions, 38 deletions)
@@ -335,28 +335,43 @@ def IsKeyword(name):
R(r'[^\0]', Id.Lit_Other), # everything else is literal
]

# Notes on BASH_REGEX states
#
# From bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
# matched as a string.
# - Bracket expressions in regular expressions must be treated carefully, since
# normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
# expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From code: ( | ) are treated special.

LEXER_DEF[lex_mode_e.BASH_REGEX] = [
# Match these literals first, and then the rest of the OUTER state I guess.
# That's how bash works.
#
# At a minimum, you do need $ and ~ expansions to happen. <>;& could have
# been allowed unescaped too, but that's not what bash does. The criteria
# was whether they were "special" in both languages, which seems dubious.

LEXER_DEF[lex_mode_e.BASH_REGEX] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
# NOTE: bash accounts for spaces and non-word punctuation like ; inside ()
# and []. We will avoid that and ask the user to extract a variable.

C('(', Id.Lit_Chars),
C(')', Id.Lit_Chars),
C('|', Id.Lit_Chars),
] + [
# Avoid "unreachable rule error"
(is_regex, pat, re_list) for
(is_regex, pat, re_list) in _UNQUOTED
if not (is_regex == False and pat in ('(', ')', '|'))
]
# and []. We will avoid that and ask the user to extract a variable?

R(r'[a-zA-Z0-9_/-]+', Id.Lit_Chars), # not including period
R(r'[ \t\r]+', Id.WS_Space),

# From _BACKSLASH
R(r'\\[^\n\0]', Id.Lit_EscapedChar),
C('\\\n', Id.Ignored_LineCont),

#C('{', Id.Lit_RegexMeta), # { -> \{
#C('}', Id.Lit_RegexMeta), # } -> \}
# In [[ foo =~ foo$ ]], the $ doesn't get escaped
#C('$', Id.Lit_RegexMeta),

# NOTE: ( | and ) aren't operators!
R(r'[^\0]', Id.Lit_Other), # everything else is literal
]

LEXER_DEF[lex_mode_e.DQ] = [
# Only 4 characters are backslash escaped inside "".
@@ -507,25 +522,6 @@ def IsKeyword(name):
R(r'[^\0]', Id.Unknown_Tok) # any char. This should be a syntax error.
]

# Notes on BASH_REGEX states
#
# From bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
# matched as a string.
# - Bracket expressions in regular expressions must be treated carefully, since
# normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
# expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From code: ( | ) are treated special.


# A lexer for the parser that converts globs to extended regexes. Since we're
# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
# don't need lexer modes here.
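The new BASH_REGEX mode is a short, self-contained rule list (runs of word characters, whitespace, backslash escapes, and a catch-all that keeps everything else literal) layered on the usual substitution and variable rules, instead of a filtered copy of the OUTER table. A toy model of how such an ordered rule table tokenizes the pattern from the bash-completion example; only the literal rules are modeled, the token names are illustrative, and the real lexer's matching machinery differs:

```python
import re

RULES = [                                      # first matching rule wins
    (re.compile(r'\\[^\n]'),         'Lit_EscapedChar'),
    (re.compile(r'[a-zA-Z0-9_/-]+'), 'Lit_Chars'),   # not including period
    (re.compile(r'[ \t\r]+'),        'WS_Space'),
    (re.compile(r'.', re.DOTALL),    'Lit_Other'),   # everything else is literal
]

def tokenize(s):
    pos, out = 0, []
    while pos < len(s):
        for pat, tok_id in RULES:
            m = pat.match(s, pos)
            if m:
                out.append((tok_id, m.group(0)))
                pos = m.end()
                break
        else:
            raise AssertionError('no rule matched at %d' % pos)
    return out

for tok in tokenize(r'^(\$\{?)([A-Za-z0-9_]*)$'):
    print(tok)
```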
spec/regex.test.sh: 44 changes (32 additions, 12 deletions)
@@ -60,18 +60,18 @@ argv.py "${BASH_REMATCH[@]}"
#### Regex quoted with single quotes
# bash doesn't like the quotes
[[ 'a b' =~ '^(a b)$' ]] && echo true
- ## stdout: true
- ## status: 0
- ## OK bash stdout-json: ""
- ## OK bash status: 1
+ ## stdout-json: ""
+ ## status: 1
+ ## OK zsh stdout: true
+ ## OK zsh status: 0

#### Regex quoted with double quotes
# bash doesn't like the quotes
[[ 'a b' =~ "^(a b)$" ]] && echo true
- ## stdout: true
- ## status: 0
- ## OK bash stdout-json: ""
- ## OK bash status: 1
+ ## stdout-json: ""
+ ## status: 1
+ ## OK zsh stdout: true
+ ## OK zsh status: 0

#### Fix single quotes by storing in variable
pat='^(a b)$'
@@ -86,10 +86,10 @@ pat="^(a b)$"
#### Double quoting pat variable -- again bash doesn't like it.
pat="^(a b)$"
[[ 'a b' =~ "$pat" ]] && echo true
- ## stdout: true
- ## status: 0
- ## OK bash stdout-json: ""
- ## OK bash status: 1
+ ## stdout-json: ""
+ ## status: 1
+ ## OK zsh stdout: true
+ ## OK zsh status: 0

#### Regex with == and not =~ is parse error, different lexer mode required
# They both give a syntax error. This is lame.
@@ -146,6 +146,26 @@ status=0
status=1
## END

+ #### Escaped {
+ # from bash-completion
+ [[ '$PA' =~ ^(\$\{?)([A-Za-z0-9_]*)$ ]] && argv.py "${BASH_REMATCH[@]}"
+ ## STDOUT:
+ ['$PA', '$', 'PA']
+ ## END
+ ## BUG zsh stdout-json: ""
+ ## BUG zsh status: 1
+
+ #### Escaped { stored in variable first
+ # from bash-completion
+ pat='^(\$\{?)([A-Za-z0-9_]*)$'
+ [[ '$PA' =~ $pat ]] && argv.py "${BASH_REMATCH[@]}"
+ ## STDOUT:
+ ['$PA', '$', 'PA']
+ ## END
+ ## BUG zsh STDOUT:
+ ['']
+ ## END

#### regex with ?
[[ 'c' =~ c? ]] && echo true
[[ '' =~ c? ]] && echo true
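A quick cross-check of the expected BASH_REMATCH contents in the new "Escaped {" tests, using Python's `re`, which is close enough to POSIX ERE for this pattern:

```python
import re

pat = r'^(\$\{?)([A-Za-z0-9_]*)$'        # the pattern from the new spec tests
m = re.match(pat, '$PA')
print([m.group(0)] + list(m.groups()))   # ['$PA', '$', 'PA']
```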
