Greatly simplify the ~ detection.

This code is equivalent to the old algorithm, except we use the power of regexes in the lexer. We no longer "manufacture" tokens in word.TildeDetect(). Instead we just shuffle them between a LiteralPart and a TildeSubPart. All unit tests and spec tests pass.
oilshell · Aug 29, 2018 · f55817c · f55817c
1 parent 69d24d3
commit f55817c
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 78 deletions.
diff --git a/core/id_kind.py b/core/id_kind.py
@@ -152,7 +152,7 @@ def AddKinds(spec):
       # Either brace expansion or keyword for { and }
       'LBrace', 'RBrace', 'Comma',
       'DRightBracket',     # the ]] that matches [[, NOT a keyword
-      'Tilde',             # tilde expansion
+      'TildeLike',         # tilde expansion
       'Pound',             #  for comment or VAROP state
       'Slash', 'Percent',  #  / # % for patsub, NOT unary op
       'Digits',            # for lex_mode_e.ARITH

diff --git a/core/word.py b/core/word.py
@@ -279,78 +279,41 @@ def RightMostSpanForWord(w):
 # We only detect ~Lit_Chars and split.  So we might as well just write a regex.
 
 def TildeDetect(word):
-  """Detect tilde expansion.
-
-  If it needs to include a TildeSubPart, return a new word.  Otherwise return
-  None.
-
-  NOTE: This algorithm would be a simpler if
-  1. We could assume some regex for user names.
-  2. We didn't need to do brace expansion first, like {~foo,~bar}
-  OR
-  - If Lit_Slash were special (it is in the VAROP states, but not OUTER
-  state).  We could introduce another lexer mode after you hit Lit_Tilde?
-
-  So we have to scan all LiteralPart instances until they contain a '/'.
-
-  http://unix.stackexchange.com/questions/157426/what-is-the-regex-to-validate-linux-users
-  "It is usually recommended to only use usernames that begin with a lower
-  case letter or an underscore, followed by lower case letters, digits,
-  underscores, or dashes. They can end with a dollar sign. In regular
-  expression terms: [a-z_][a-z0-9_-]*[$]?
-
-  On Debian, the only constraints are that usernames must neither start with
-  a dash ('-') nor contain a colon (':') or a whitespace (space: ' ', end
-  of line: '\n', tabulation: '\t', etc.). Note that using a slash ('/') may
-  break the default algorithm for the definition of the user's home
-  directory.
+  """Detect tilde expansion in a word.
+
+  It might begin with a LiteralPart that needs to be turned into a TildeSubPart.
+  (It depends on whether the second token begins with a slash.)
+
+  If so, it returns a new word.  Otherwise it returns None.
+
+  NOTE:
+  - The regex for Lit_TildeLike could be expanded.  Right now it's
+    conservative, like Lit_Chars without the /.
+  - It's possible to write this in a mutating style, since only the first token
+    is changed.  But note that we CANNOT know this during lexing.
   """
-  if not word.parts:
+  # NOTE: BracedWordTree, EmptyWord, etc. can't be tilde expanded
+  if word.tag != word_e.CompoundWord:
     return None
+
+  assert word.parts, word
+
   part0 = word.parts[0]
-  if _LiteralPartId(part0) != Id.Lit_Tilde:
+  if _LiteralPartId(part0) != Id.Lit_TildeLike:
     return None
 
-  prefix = ''
-  found_slash = False
-  # search for the next /
-  for i in range(1, len(word.parts)):
-    # Not a literal part, and we did NOT find a slash.  So there is no
-    # TildeSub applied.  This would be something like ~X$var, ~$var,
-    # ~$(echo), etc..  The slash is necessary.
-    if word.parts[i].tag != word_part_e.LiteralPart:
-      return None
-    val = word.parts[i].token.val
-    p = val.find('/')
-
-    if p == -1:  # no slash yet
-      prefix += val
-
-    elif p >= 0:
-      # e.g. for ~foo!bar/baz, extract "bar"
-      # NOTE: requires downcast to LiteralPart
-      pre, post = val[:p], val[p:]
-      prefix += pre
-      tilde_part = ast.TildeSubPart(prefix)
-      # NOTE: no span_id here.  It would be nicer to use a different algorithm
-      # that didn't require this.
-      t = ast.token(Id.Lit_Chars, post, const.NO_INTEGER)
-      remainder_part = ast.LiteralPart(t)
-      found_slash = True
-      break
-
-  w = ast.CompoundWord()
-  if found_slash:
-    w.parts.append(tilde_part)
-    w.parts.append(remainder_part)
-    j = i + 1
-    while j < len(word.parts):
-      w.parts.append(word.parts[j])
-      j += 1
-  else:
-    # The whole thing is a tilde sub, e.g. ~foo or ~foo!bar
-    w.parts.append(ast.TildeSubPart(prefix))
-  return w
+  if len(word.parts) == 1:  # can't be zero
+    tilde_part = ast.TildeSubPart(part0.token)
+    return ast.CompoundWord([tilde_part])
+
+  part1 = word.parts[1]
+  # NOTE: We could inspect the raw tokens.
+  if _LiteralPartId(part1) == Id.Lit_Chars and part1.token.val.startswith('/'):
+    tilde_part = ast.TildeSubPart(part0.token)
+    return ast.CompoundWord([tilde_part] + word.parts[1:])
+
+  # It could be something like '~foo:bar', which doesn't have a slash.
+  return None
 
 
 def TildeDetectAll(words):

diff --git a/core/word_eval.py b/core/word_eval.py
@@ -158,27 +158,32 @@ def _EvalProcessSub(self, part, id_):
     """
     raise NotImplementedError
 
-  def _EvalTildeSub(self, prefix):
+  def _EvalTildeSub(self, token):
     """Evaluates ~ and ~user.
 
     Args:
       token: The tilde token, whose val is '~' or '~' followed by a user name
     """
-    if prefix == '':
+    if token.val == '~':
       # First look up the HOME var, and then env var
       val = self.mem.GetVar('HOME')
       assert val.tag == value_e.Str, val
       return val.s
 
     # For ~otheruser/src.  TODO: Should this be cached?
     # http://linux.die.net/man/3/getpwnam
+    name = token.val[1:]
     try:
-      e = pwd.getpwnam(prefix)
+      e = pwd.getpwnam(name)
     except KeyError:
-      s = '~' + prefix
+      # If the user is not found, keep the literal text, e.g. ~nonexistent.
+      # TODO: In strict mode, this should be an error, kind of like failglob
+      # and nounset.  Perhaps strict-tilde or even strict-word-eval.
+      result = token.val
     else:
-      s = e.pw_dir
-    return s
+      result = e.pw_dir
+
+    return result
 
   def _EvalVarNum(self, var_num):
     assert var_num >= 0
@@ -754,7 +759,7 @@ def _EvalWordPart(self, part, part_vals, quoted=False):
     elif part.tag == word_part_e.TildeSubPart:
       # We never parse a quoted string into a TildeSubPart.
       assert not quoted
-      s = self._EvalTildeSub(part.prefix)
+      s = self._EvalTildeSub(part.token)
       v = runtime.StringPartValue(s, False)
       part_vals.append(v)
 

diff --git a/osh/lex.py b/osh/lex.py
@@ -196,6 +196,10 @@
   # Id.Lit_Chars.
   R(r'[a-zA-Z_][a-zA-Z0-9_]*\+?=', Id.Lit_VarLike),
 
+  # For tilde expansion. The list of chars is Lit_Chars, but WITHOUT the /.  We
+  # want the token after the TildeLike token to start with a /.
+  R(r'~[a-zA-Z0-9_.-]*', Id.Lit_TildeLike),
+
   C('#', Id.Lit_Pound),  # For comments
 
   # Needs to be LONGER than any other
@@ -207,8 +211,6 @@
   C('{', Id.Lit_LBrace),
   C('}', Id.Lit_RBrace),  # Also for var sub ${a}
   C(',', Id.Lit_Comma),
-  C('~', Id.Lit_Tilde),  # For tilde expansion
-                         # TODO: Add the rest of Lit_Chars minus / here.
 
   R(r'[ \t\r]+', Id.WS_Space),
 

diff --git a/osh/osh.asdl b/osh/osh.asdl
@@ -82,7 +82,7 @@ module osh
                 bracket_op? bracket_op
                 suffix_op? suffix_op)
     -- This should be token tilde, token rest
-  | TildeSubPart(string prefix)
+  | TildeSubPart(token token)
     -- For command sub and process sub: $(...)  <(...)  >(...)
   | CommandSubPart(command command_list, token left_token)
   | ArithSubPart(arith_expr anode)

diff --git a/osh/word_parse_test.py b/osh/word_parse_test.py
@@ -313,6 +313,14 @@ def testArrayOp(self):
   def testTestOp(self):
     w = _assertReadWord(self, '${var:-default]}')
 
+  def testTildeLike(self):
+    w = _assertReadWord(self, '~/git/oilshell/oil')
+    w = _assertReadWord(self, '~andy/git/oilshell/oil')
+    w = _assertReadWord(self, '~andy_c/git/oilshell/oil')
+    w = _assertReadWord(self, '~andy.c/git/oilshell/oil')
+    w = _assertReadWord(self, '~andy-c/git/oilshell/oil')
+    w = _assertReadWord(self, '~andy-c:git/oilshell/oil')
+
   def testRead(self):
     CASES = [
         'ls "foo"',