Permalink
Browse files

Greatly simplify the ~ detection.

This code is equivalent to the old algorithm, except we use the
power of regexes in the lexer.

We no longer "manufacture" tokens in word.TildeDetect().  Instead we
just shuffle them between a LiteralPart and a TildeSubPart.

All unit tests and spec tests pass.
  • Loading branch information...
Andy Chu
Andy Chu committed Aug 29, 2018
1 parent 69d24d3 commit f55817c03923c0d412c0a011fea5719b4602e289
Showing with 56 additions and 78 deletions.
  1. +1 −1 core/id_kind.py
  2. +30 −67 core/word.py
  3. +12 −7 core/word_eval.py
  4. +4 −2 osh/lex.py
  5. +1 −1 osh/osh.asdl
  6. +8 −0 osh/word_parse_test.py
View
@@ -152,7 +152,7 @@ def AddKinds(spec):
# Either brace expansion or keyword for { and }
'LBrace', 'RBrace', 'Comma',
'DRightBracket', # the ]] that matches [[, NOT a keyword
'Tilde', # tilde expansion
'TildeLike', # tilde expansion
'Pound', # for comment or VAROP state
'Slash', 'Percent', # / # % for patsub, NOT unary op
'Digits', # for lex_mode_e.ARITH
View
@@ -279,78 +279,41 @@ def RightMostSpanForWord(w):
# We only detect ~Lit_Chars and split. So we might as well just write a regex.
def TildeDetect(word):
"""Detect tilde expansion.
If it needs to include a TildeSubPart, return a new word. Otherwise return
None.
NOTE: This algorithm would be simpler if
1. We could assume some regex for user names.
2. We didn't need to do brace expansion first, like {~foo,~bar}
OR
- If Lit_Slash were special (it is in the VAROP states, but not OUTER
state). We could introduce another lexer mode after you hit Lit_Tilde?
So we have to scan all LiteralPart instances until they contain a '/'.
http://unix.stackexchange.com/questions/157426/what-is-the-regex-to-validate-linux-users
"It is usually recommended to only use usernames that begin with a lower
case letter or an underscore, followed by lower case letters, digits,
underscores, or dashes. They can end with a dollar sign. In regular
expression terms: [a-z_][a-z0-9_-]*[$]?
On Debian, the only constraints are that usernames must neither start with
a dash ('-') nor contain a colon (':') or a whitespace (space: ' ', end
of line: '\n', tabulation: '\t', etc.). Note that using a slash ('/') may
break the default algorithm for the definition of the user's home
directory.
"""Detect tilde expansion in a word.
It might begin with LiteralPart that needs to be turned into a TildeSubPart.
(It depends on whether the second token begins with slash).
If so, it returns a new word. Otherwise it returns None.
NOTE:
- The regex for Lit_TildeLike could be expanded. Right now it's
conservative, like Lit_Chars without the /.
- It's possible to write this in a mutating style, since only the first token
is changed. But note that we CANNOT know this during lexing.
"""
if not word.parts:
# NOTE: BracedWordTree, EmptyWord, etc. can't be tilde expanded
if word.tag != word_e.CompoundWord:
return None
assert word.parts, word
part0 = word.parts[0]
if _LiteralPartId(part0) != Id.Lit_Tilde:
if _LiteralPartId(part0) != Id.Lit_TildeLike:
return None
prefix = ''
found_slash = False
# search for the next /
for i in range(1, len(word.parts)):
# Not a literal part, and we did NOT find a slash. So there is no
# TildeSub applied. This would be something like ~X$var, ~$var,
# ~$(echo), etc.. The slash is necessary.
if word.parts[i].tag != word_part_e.LiteralPart:
return None
val = word.parts[i].token.val
p = val.find('/')
if p == -1: # no slash yet
prefix += val
elif p >= 0:
# e.g. for ~foo!bar/baz, extract "bar"
# NOTE: requires downcast to LiteralPart
pre, post = val[:p], val[p:]
prefix += pre
tilde_part = ast.TildeSubPart(prefix)
# NOTE: no span_id here. It would be nicer to use a different algorithm
# that didn't require this.
t = ast.token(Id.Lit_Chars, post, const.NO_INTEGER)
remainder_part = ast.LiteralPart(t)
found_slash = True
break
w = ast.CompoundWord()
if found_slash:
w.parts.append(tilde_part)
w.parts.append(remainder_part)
j = i + 1
while j < len(word.parts):
w.parts.append(word.parts[j])
j += 1
else:
# The whole thing is a tilde sub, e.g. ~foo or ~foo!bar
w.parts.append(ast.TildeSubPart(prefix))
return w
if len(word.parts) == 1: # can't be zero
tilde_part = ast.TildeSubPart(part0.token)
return ast.CompoundWord([tilde_part])
part1 = word.parts[1]
# NOTE: We could inspect the raw tokens.
if _LiteralPartId(part1) == Id.Lit_Chars and part1.token.val.startswith('/'):
tilde_part = ast.TildeSubPart(part0.token)
return ast.CompoundWord([tilde_part] + word.parts[1:])
# It could be something like '~foo:bar', which doesn't have a slash.
return None
def TildeDetectAll(words):
View
@@ -158,27 +158,32 @@ def _EvalProcessSub(self, part, id_):
"""
raise NotImplementedError
def _EvalTildeSub(self, prefix):
def _EvalTildeSub(self, token):
"""Evaluates ~ and ~user.
Args:
token: The Lit_TildeLike token; its val is '~', optionally followed by a user name
"""
if prefix == '':
if token.val == '~':
# First look up the HOME var, and then env var
val = self.mem.GetVar('HOME')
assert val.tag == value_e.Str, val
return val.s
# For ~otheruser/src. TODO: Should this be cached?
# http://linux.die.net/man/3/getpwnam
name = token.val[1:]
try:
e = pwd.getpwnam(prefix)
e = pwd.getpwnam(name)
except KeyError:
s = '~' + prefix
# If not found, it's ~nonexistent. TODO: In strict mode, this should be
# an error, kind of like failglob and nounset. Perhaps strict-tilde or
# even strict-word-eval.
result = token.val
else:
s = e.pw_dir
return s
result = e.pw_dir
return result
def _EvalVarNum(self, var_num):
assert var_num >= 0
@@ -754,7 +759,7 @@ def _EvalWordPart(self, part, part_vals, quoted=False):
elif part.tag == word_part_e.TildeSubPart:
# We never parse a quoted string into a TildeSubPart.
assert not quoted
s = self._EvalTildeSub(part.prefix)
s = self._EvalTildeSub(part.token)
v = runtime.StringPartValue(s, False)
part_vals.append(v)
View
@@ -196,6 +196,10 @@
# Id.Lit_Chars.
R(r'[a-zA-Z_][a-zA-Z0-9_]*\+?=', Id.Lit_VarLike),
# For tilde expansion. The list of chars is Lit_Chars, but WITHOUT the /. We
# want the token after the TildeLike token to start with a /.
R(r'~[a-zA-Z0-9_.-]*', Id.Lit_TildeLike),
C('#', Id.Lit_Pound), # For comments
# Needs to be LONGER than any other
@@ -207,8 +211,6 @@
C('{', Id.Lit_LBrace),
C('}', Id.Lit_RBrace), # Also for var sub ${a}
C(',', Id.Lit_Comma),
C('~', Id.Lit_Tilde), # For tilde expansion
# TODO: Add the rest of Lit_Chars minus / here.
R(r'[ \t\r]+', Id.WS_Space),
View
@@ -82,7 +82,7 @@ module osh
bracket_op? bracket_op
suffix_op? suffix_op)
-- This should be token tilde, token rest
| TildeSubPart(string prefix)
| TildeSubPart(token token)
-- For command sub and process sub: $(...) <(...) >(...)
| CommandSubPart(command command_list, token left_token)
| ArithSubPart(arith_expr anode)
View
@@ -313,6 +313,14 @@ def testArrayOp(self):
def testTestOp(self):
w = _assertReadWord(self, '${var:-default]}')
def testTildeLike(self):
w = _assertReadWord(self, '~/git/oilshell/oil')
w = _assertReadWord(self, '~andy/git/oilshell/oil')
w = _assertReadWord(self, '~andy_c/git/oilshell/oil')
w = _assertReadWord(self, '~andy.c/git/oilshell/oil')
w = _assertReadWord(self, '~andy-c/git/oilshell/oil')
w = _assertReadWord(self, '~andy-c:git/oilshell/oil')
def testRead(self):
CASES = [
'ls "foo"',

0 comments on commit f55817c

Please sign in to comment.