Skip to content

Commit

Permalink
Greatly simplify the ~ detection.
Browse files Browse the repository at this point in the history
This code is equivalent to the old algorithm, except we use the
power of regexes in the lexer.

We no longer "manufacture" tokens in word.TildeDetect().  Instead we
just shuffle them between a LiteralPart and a TildeSubPart.

All unit tests and spec tests pass.
  • Loading branch information
Andy Chu committed Aug 29, 2018
1 parent 69d24d3 commit f55817c
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 78 deletions.
2 changes: 1 addition & 1 deletion core/id_kind.py
Expand Up @@ -152,7 +152,7 @@ def AddKinds(spec):
# Either brace expansion or keyword for { and }
'LBrace', 'RBrace', 'Comma',
'DRightBracket', # the ]] that matches [[, NOT a keyword
'Tilde', # tilde expansion
'TildeLike', # tilde expansion
'Pound', # for comment or VAROP state
'Slash', 'Percent', # / # % for patsub, NOT unary op
'Digits', # for lex_mode_e.ARITH
Expand Down
97 changes: 30 additions & 67 deletions core/word.py
Expand Up @@ -279,78 +279,41 @@ def RightMostSpanForWord(w):
# We only detect ~Lit_Chars and split. So we might as well just write a regex.

def TildeDetect(word):
"""Detect tilde expansion.
If it needs to include a TildeSubPart, return a new word. Otherwise return
None.
NOTE: This algorithm would be simpler if
1. We could assume some regex for user names.
2. We didn't need to do brace expansion first, like {~foo,~bar}
OR
- If Lit_Slash were special (it is in the VAROP states, but not OUTER
state). We could introduce another lexer mode after you hit Lit_Tilde?
So we have to scan all LiteralPart instances until they contain a '/'.
http://unix.stackexchange.com/questions/157426/what-is-the-regex-to-validate-linux-users
"It is usually recommended to only use usernames that begin with a lower
case letter or an underscore, followed by lower case letters, digits,
underscores, or dashes. They can end with a dollar sign. In regular
expression terms: [a-z_][a-z0-9_-]*[$]?
On Debian, the only constraints are that usernames must neither start with
a dash ('-') nor contain a colon (':') or a whitespace (space: ' ', end
of line: '\n', tabulation: '\t', etc.). Note that using a slash ('/') may
break the default algorithm for the definition of the user's home
directory.
"""Detect tilde expansion in a word.
It might begin with a LiteralPart that needs to be turned into a TildeSubPart.
(It depends on whether the second token begins with a slash.)
If so, return a new word. Otherwise return None.
NOTE:
- The regex for Lit_TildeLike could be expanded. Right now it's
conservative, like Lit_Chars without the /.
- It's possible to write this in a mutating style, since only the first token
is changed. But note that we CANNOT know this during lexing.
"""
if not word.parts:
# NOTE: BracedWordTree, EmptyWord, etc. can't be tilde expanded
if word.tag != word_e.CompoundWord:
return None

assert word.parts, word

part0 = word.parts[0]
if _LiteralPartId(part0) != Id.Lit_Tilde:
if _LiteralPartId(part0) != Id.Lit_TildeLike:
return None

prefix = ''
found_slash = False
# search for the next /
for i in range(1, len(word.parts)):
# Not a literal part, and we did NOT find a slash. So there is no
# TildeSub applied. This would be something like ~X$var, ~$var,
# ~$(echo), etc.. The slash is necessary.
if word.parts[i].tag != word_part_e.LiteralPart:
return None
val = word.parts[i].token.val
p = val.find('/')

if p == -1: # no slash yet
prefix += val

elif p >= 0:
# e.g. for ~foo!bar/baz, extract "bar"
# NOTE: requires downcast to LiteralPart
pre, post = val[:p], val[p:]
prefix += pre
tilde_part = ast.TildeSubPart(prefix)
# NOTE: no span_id here. It would be nicer to use a different algorithm
# that didn't require this.
t = ast.token(Id.Lit_Chars, post, const.NO_INTEGER)
remainder_part = ast.LiteralPart(t)
found_slash = True
break

w = ast.CompoundWord()
if found_slash:
w.parts.append(tilde_part)
w.parts.append(remainder_part)
j = i + 1
while j < len(word.parts):
w.parts.append(word.parts[j])
j += 1
else:
# The whole thing is a tilde sub, e.g. ~foo or ~foo!bar
w.parts.append(ast.TildeSubPart(prefix))
return w
if len(word.parts) == 1: # can't be zero
tilde_part = ast.TildeSubPart(part0.token)
return ast.CompoundWord([tilde_part])

part1 = word.parts[1]
# NOTE: We could inspect the raw tokens.
if _LiteralPartId(part1) == Id.Lit_Chars and part1.token.val.startswith('/'):
tilde_part = ast.TildeSubPart(part0.token)
return ast.CompoundWord([tilde_part] + word.parts[1:])

# It could be something like '~foo:bar', which doesn't have a slash.
return None


def TildeDetectAll(words):
Expand Down
19 changes: 12 additions & 7 deletions core/word_eval.py
Expand Up @@ -158,27 +158,32 @@ def _EvalProcessSub(self, part, id_):
"""
raise NotImplementedError

def _EvalTildeSub(self, prefix):
def _EvalTildeSub(self, token):
"""Evaluates ~ and ~user.
Args:
token: The Lit_TildeLike token, whose value is '~' or '~user'
"""
if prefix == '':
if token.val == '~':
# First look up the HOME var, and then env var
val = self.mem.GetVar('HOME')
assert val.tag == value_e.Str, val
return val.s

# For ~otheruser/src. TODO: Should this be cached?
# http://linux.die.net/man/3/getpwnam
name = token.val[1:]
try:
e = pwd.getpwnam(prefix)
e = pwd.getpwnam(name)
except KeyError:
s = '~' + prefix
# If not found, it's ~nonexistent. TODO: In strict mode, this should be
# an error, kind of like failglob and nounset. Perhaps strict-tilde or
# even strict-word-eval.
result = token.val
else:
s = e.pw_dir
return s
result = e.pw_dir

return result

def _EvalVarNum(self, var_num):
assert var_num >= 0
Expand Down Expand Up @@ -754,7 +759,7 @@ def _EvalWordPart(self, part, part_vals, quoted=False):
elif part.tag == word_part_e.TildeSubPart:
# We never parse a quoted string into a TildeSubPart.
assert not quoted
s = self._EvalTildeSub(part.prefix)
s = self._EvalTildeSub(part.token)
v = runtime.StringPartValue(s, False)
part_vals.append(v)

Expand Down
6 changes: 4 additions & 2 deletions osh/lex.py
Expand Up @@ -196,6 +196,10 @@
# Id.Lit_Chars.
R(r'[a-zA-Z_][a-zA-Z0-9_]*\+?=', Id.Lit_VarLike),

# For tilde expansion. The list of chars is Lit_Chars, but WITHOUT the /. We
# want the token after the TildeLike token to start with a /.
R(r'~[a-zA-Z0-9_.-]*', Id.Lit_TildeLike),

C('#', Id.Lit_Pound), # For comments

# Needs to be LONGER than any other
Expand All @@ -207,8 +211,6 @@
C('{', Id.Lit_LBrace),
C('}', Id.Lit_RBrace), # Also for var sub ${a}
C(',', Id.Lit_Comma),
C('~', Id.Lit_Tilde), # For tilde expansion
# TODO: Add the rest of Lit_Chars minus / here.

R(r'[ \t\r]+', Id.WS_Space),

Expand Down
2 changes: 1 addition & 1 deletion osh/osh.asdl
Expand Up @@ -82,7 +82,7 @@ module osh
bracket_op? bracket_op
suffix_op? suffix_op)
-- This should be token tilde, token rest
| TildeSubPart(string prefix)
| TildeSubPart(token token)
-- For command sub and process sub: $(...) <(...) >(...)
| CommandSubPart(command command_list, token left_token)
| ArithSubPart(arith_expr anode)
Expand Down
8 changes: 8 additions & 0 deletions osh/word_parse_test.py
Expand Up @@ -313,6 +313,14 @@ def testArrayOp(self):
def testTestOp(self):
w = _assertReadWord(self, '${var:-default]}')

def testTildeLike(self):
w = _assertReadWord(self, '~/git/oilshell/oil')
w = _assertReadWord(self, '~andy/git/oilshell/oil')
w = _assertReadWord(self, '~andy_c/git/oilshell/oil')
w = _assertReadWord(self, '~andy.c/git/oilshell/oil')
w = _assertReadWord(self, '~andy-c/git/oilshell/oil')
w = _assertReadWord(self, '~andy-c:git/oilshell/oil')

def testRead(self):
CASES = [
'ls "foo"',
Expand Down

0 comments on commit f55817c

Please sign in to comment.