Permalink
Browse files

Split patsub and var-op-strip implementations into their own module.

Alpine Linux's abuild uses character classes in globs.  To support that
properly, I want to rewrite it without the Python regex module.

- gold/*.sh: Test case for running abuild.
- osh/glob.asdl: Sketch out an LST schema for POSIX globs.  Not used.

Unrelated:

- Remove invalid state transition in core/legacy.py
  • Loading branch information...
Andy Chu
Andy Chu committed Jan 7, 2018
1 parent 201ed53 commit 8fcdd4233290e0018cef46fa4ca814ece2f11a0b
Showing with 213 additions and 123 deletions.
  1. +6 −2 core/legacy.py
  2. +149 −0 core/libstr.py
  3. +0 −2 core/runtime.asdl
  4. +5 −114 core/word_eval.py
  5. +6 −0 gold/strip-op-char-class.sh
  6. +40 −0 osh/glob.asdl
  7. +5 −5 scripts/count.sh
  8. +2 −0 test/gold.sh
View
@@ -241,13 +241,14 @@ def Split(self, s, allow_escape):
CH_DE_WHITE, CH_DE_GRAY, CH_BLACK, CH_BACKSLASH = range(4)
# Nodes are states
ST_START, ST_DE_WHITE1, ST_DE_GRAY, ST_DE_WHITE2, ST_BLACK, ST_BACKSLASH = range(6)
ST_INVALID, ST_START, ST_DE_WHITE1, ST_DE_GRAY, ST_DE_WHITE2, ST_BLACK, ST_BACKSLASH = range(7)
# Actions control what spans to emit.
EMIT_PART, EMIT_DE, EMIT_EMPTY, EMIT_ESCAPE, NO_EMIT = range(5)
TRANSITIONS = {
(ST_START, CH_DE_WHITE): (ST_DE_WHITE1, NO_EMIT), # ' '
# Whitespace should have been stripped
(ST_START, CH_DE_WHITE): (ST_INVALID, NO_EMIT), # ' '
(ST_START, CH_DE_GRAY): (ST_DE_GRAY, EMIT_EMPTY), # '_'
(ST_START, CH_BLACK): (ST_BLACK, NO_EMIT), # 'a'
(ST_START, CH_BACKSLASH): (ST_BACKSLASH, NO_EMIT), # '\'
@@ -335,6 +336,9 @@ def Split(self, s, allow_escape):
ch = CH_BLACK
new_state, action = TRANSITIONS[state, ch]
if new_state == ST_INVALID:
raise AssertionError(
'Invalid transition from %r with %r' % (state, ch))
#from core.util import log
#log('i %d c %r ch %s state %s new_state %s action %s', i, c, ch, state, new_state, action)
View
@@ -0,0 +1,149 @@
#!/usr/bin/python
"""
libstr.py - String library functions that can be exposed with a saner syntax.
Instead of
local y=${x//a*/b}
var y = x -> sub('a*', 'b', :ALL)
Or maybe:
var y = x -> sub( g/a*/, 'b', :ALL)
"""
from core import glob_
from core.id_kind import Id
# Implementation without Python regex:
#
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
# regexec in a loop. fnmatch() does NOT give positions of matches.
#
# (2) Strip -- % %% # ## -
#
# a. Fast path for constant strings.
# b. Convert to POSIX extended regex, to see if it matches at ALL. If it
# doesn't match, short circuit out? We can't do this with fnmatch.
# c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
#
# - # shortest prefix - [:1], [:2], [:3] until it matches
# - ## longest prefix - [:-1] [:-2], [:-3]. Works because fnmatch does not
# match prefixes, it matches EXACTLY.
# - % shortest suffix - [-1:] [-2:] [-3:] ...
# - %% longest suffix - [1:] [2:] [3:]
#
# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
# mksh. Dash doesn't implement it.
# TODO:
# - Unicode support: Convert both pattern, string, and replacement to unicode,
# then the result back at the end.
# - Add location info to errors. Maybe pass spid pair all the way down.
# - Compile time errors for [[:space:]] ?
def DoUnarySuffixOp(s, op, arg):
  """Helper for ${x#prefix} and family.

  Removes a prefix or suffix of `s` that matches the glob `arg`.

  Args:
    s: The string to strip.
    op: Operator node.  Only op.op_id is read; it must be one of
      Id.VOp1_Pound (#), Id.VOp1_DPound (##), Id.VOp1_Percent (%) or
      Id.VOp1_DPercent (%%).
    arg: The glob pattern (or fixed string) to remove.

  Returns:
    `s` with the matched prefix/suffix removed, or `s` itself when nothing
    matches.

  Raises:
    AssertionError: if op.op_id is not one of the four strip operators.
  """
  # Function-scope import: this module has no top-level 'import re'.
  import re

  pat_re, err = glob_.GlobToPythonRegex(arg)
  if err:
    # NOTE(review): e_die is not imported at the top of this module either.
    # It is assumed to live in core.util -- confirm.  Imported lazily so that
    # a wrong guess can only affect this error path.
    from core.util import e_die
    e_die("Can't convert glob to regex: %r", arg)

  if pat_re is None:  # simple/fast path for fixed strings
    if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
      if s.startswith(arg):
        return s[len(arg):]
      return s

    if op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
      if s.endswith(arg):
        # Not s[:-len(arg)]: with an empty arg that is s[:0] == '', which
        # would throw the whole string away instead of leaving it alone.
        return s[:len(s) - len(arg)]
      return s

    raise AssertionError(op.op_id)  # e.g. ^ ^^ , ,,

  # Glob pattern.  A shell glob's * and ? also match newlines, so '.' must
  # too.  (Assumes GlobToPythonRegex translates them to '.'-based regexes --
  # TODO confirm.)
  flags = re.DOTALL

  if op.op_id == Id.VOp1_Pound:  # shortest prefix
    # Need non-greedy match.  The greedy conversion above already validated
    # the pattern, so the second error value is not re-checked.
    pat_re2, _ = glob_.GlobToPythonRegex(arg, greedy=False)
    m = re.compile(pat_re2, flags).match(s)
    if m:
      return s[m.end(0):]
    return s

  if op.op_id == Id.VOp1_DPound:  # longest prefix
    m = re.compile(pat_re, flags).match(s)
    if m:
      return s[m.end(0):]
    return s

  # For suffixes, group 1 captures what is KEPT.  \Z rather than $ because $
  # also matches just before a trailing newline, which would silently drop it.
  if op.op_id == Id.VOp1_Percent:  # shortest suffix
    # NOTE: This is different than re.search, which will find the longest
    # suffix.  A greedy kept-part leaves the shortest suffix to remove.
    m = re.compile('^(.*)' + pat_re + r'\Z', flags).match(s)
    if m:
      return m.group(1)
    return s

  if op.op_id == Id.VOp1_DPercent:  # longest suffix
    # Non-greedy kept-part leaves the longest suffix to remove.
    m = re.compile('^(.*?)' + pat_re + r'\Z', flags).match(s)
    if m:
      return m.group(1)
    return s

  raise AssertionError(op.op_id)
def PatSub(s, op, pat, replace_str):
  """Helper for ${x/pat/replace}.

  Args:
    s: The string to operate on.
    op: Operator node.  Reads the flags op.do_all (${x//pat/r}),
      op.do_prefix (${x/#pat/r}) and op.do_suffix (${x/%pat/r}).  With none
      set, only the first occurrence is replaced.
    pat: The glob pattern (or fixed string) to search for.
    replace_str: Literal replacement text.

  Returns:
    The string with the replacement(s) applied, or `s` if nothing matched.
  """
  # Function-scope import: this module has no top-level 'import re'.
  import re

  #log('PAT %r REPLACE %r', pat, replace_str)
  py_regex, err = glob_.GlobToPythonRegex(pat)
  if err:
    # NOTE(review): e_die is not imported at the top of this module either.
    # It is assumed to live in core.util -- confirm.  Imported lazily so that
    # a wrong guess can only affect this error path.
    from core.util import e_die
    e_die("Can't convert glob to regex: %r", pat)

  if py_regex is None:  # Simple/fast path for fixed strings
    if op.do_all:
      return s.replace(pat, replace_str)

    if op.do_prefix:
      if s.startswith(pat):
        return replace_str + s[len(pat):]
      return s

    if op.do_suffix:
      if s.endswith(pat):
        # Not s[:-n]: with an empty pattern n == 0 and s[:-0] is '', which
        # would drop s.  ${x/%/rep} must append rep to x.
        return s[:len(s) - len(pat)] + replace_str
      return s

    return s.replace(pat, replace_str, 1)  # just the first one

  count = 1  # replace first occurrence only
  if op.do_all:
    count = 0  # replace all
  elif op.do_prefix:
    py_regex = '^' + py_regex
  elif op.do_suffix:
    # \Z rather than $, which would also match before a trailing newline.
    py_regex = py_regex + r'\Z'

  # DOTALL because a shell glob's * and ? also match newlines.  (Assumes
  # GlobToPythonRegex translates them to '.'-based regexes -- TODO confirm.)
  pat_re = re.compile(py_regex, re.DOTALL)

  # Pass a function so that replace_str is inserted literally.  As a plain
  # string, re.sub would interpret backslash escapes and group references
  # like \1 in it.
  return pat_re.sub(lambda m: replace_str, s, count)
View
@@ -52,8 +52,6 @@ module runtime
ProcessStatus(int status)
| PipelineStatus(int* statuses)
-- Word splitting in legacy.py
span = Black | Delim | Backslash
}
View
@@ -9,6 +9,7 @@
from core import braces
from core import expr_eval
from core import legacy
from core import libstr
from core import glob_
from core.id_kind import Id, Kind, LookupKind
from core import runtime
@@ -107,116 +108,6 @@ def _DecayPartValuesToString(part_vals, join_char):
return ''.join(out)
# TODO:
# - Unicode support: Convert both pattern, string, and replacement to unicode,
# then the result back at the end.
# - Add location info to errors. Maybe pass spid pair all the way down.
# - Compile time errors for [[:space:]] ?
def _DoUnarySuffixOp(s, op, arg):
  """Helper for ${x#prefix} and family.

  Removes a prefix or suffix of `s` that matches the glob `arg`.

  Args:
    s: The string to strip.
    op: Operator node.  Only op.op_id is read; it must be one of
      Id.VOp1_Pound (#), Id.VOp1_DPound (##), Id.VOp1_Percent (%) or
      Id.VOp1_DPercent (%%).
    arg: The glob pattern (or fixed string) to remove.

  Returns:
    `s` with the matched prefix/suffix removed, or `s` itself when nothing
    matches.

  Raises:
    AssertionError: if op.op_id is not one of the four strip operators.
  """
  pat_re, err = glob_.GlobToPythonRegex(arg)
  if err:
    e_die("Can't convert glob to regex: %r", arg)

  if pat_re is None:  # simple/fast path for fixed strings
    if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
      if s.startswith(arg):
        return s[len(arg):]
      return s

    if op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
      if s.endswith(arg):
        # Not s[:-len(arg)]: with an empty arg that is s[:0] == '', which
        # would throw the whole string away instead of leaving it alone.
        return s[:len(s) - len(arg)]
      return s

    raise AssertionError(op.op_id)  # e.g. ^ ^^ , ,,

  # Glob pattern.  A shell glob's * and ? also match newlines, so '.' must
  # too.  (Assumes GlobToPythonRegex translates them to '.'-based regexes --
  # TODO confirm.)
  flags = re.DOTALL

  if op.op_id == Id.VOp1_Pound:  # shortest prefix
    # Need non-greedy match.  The greedy conversion above already validated
    # the pattern, so the second error value is not re-checked.
    pat_re2, _ = glob_.GlobToPythonRegex(arg, greedy=False)
    m = re.compile(pat_re2, flags).match(s)
    if m:
      return s[m.end(0):]
    return s

  if op.op_id == Id.VOp1_DPound:  # longest prefix
    m = re.compile(pat_re, flags).match(s)
    if m:
      return s[m.end(0):]
    return s

  # For suffixes, group 1 captures what is KEPT.  \Z rather than $ because $
  # also matches just before a trailing newline, which would silently drop it.
  if op.op_id == Id.VOp1_Percent:  # shortest suffix
    # NOTE: This is different than re.search, which will find the longest
    # suffix.  A greedy kept-part leaves the shortest suffix to remove.
    m = re.compile('^(.*)' + pat_re + r'\Z', flags).match(s)
    if m:
      return m.group(1)
    return s

  if op.op_id == Id.VOp1_DPercent:  # longest suffix
    # Non-greedy kept-part leaves the longest suffix to remove.
    m = re.compile('^(.*?)' + pat_re + r'\Z', flags).match(s)
    if m:
      return m.group(1)
    return s

  raise AssertionError(op.op_id)
def _PatSub(s, op, pat, replace_str):
  """Helper for ${x/pat/replace}.

  Args:
    s: The string to operate on.
    op: Operator node.  Reads the flags op.do_all (${x//pat/r}),
      op.do_prefix (${x/#pat/r}) and op.do_suffix (${x/%pat/r}).  With none
      set, only the first occurrence is replaced.
    pat: The glob pattern (or fixed string) to search for.
    replace_str: Literal replacement text.

  Returns:
    The string with the replacement(s) applied, or `s` if nothing matched.
  """
  #log('PAT %r REPLACE %r', pat, replace_str)
  py_regex, err = glob_.GlobToPythonRegex(pat)
  if err:
    e_die("Can't convert glob to regex: %r", pat)

  if py_regex is None:  # Simple/fast path for fixed strings
    if op.do_all:
      return s.replace(pat, replace_str)

    if op.do_prefix:
      if s.startswith(pat):
        return replace_str + s[len(pat):]
      return s

    if op.do_suffix:
      if s.endswith(pat):
        # Not s[:-n]: with an empty pattern n == 0 and s[:-0] is '', which
        # would drop s.  ${x/%/rep} must append rep to x.
        return s[:len(s) - len(pat)] + replace_str
      return s

    return s.replace(pat, replace_str, 1)  # just the first one

  count = 1  # replace first occurrence only
  if op.do_all:
    count = 0  # replace all
  elif op.do_prefix:
    py_regex = '^' + py_regex
  elif op.do_suffix:
    # \Z rather than $, which would also match before a trailing newline.
    py_regex = py_regex + r'\Z'

  # DOTALL because a shell glob's * and ? also match newlines.  (Assumes
  # GlobToPythonRegex translates them to '.'-based regexes -- TODO confirm.)
  pat_re = re.compile(py_regex, re.DOTALL)

  # Pass a function so that replace_str is inserted literally.  As a plain
  # string, re.sub would interpret backslash escapes and group references
  # like \1 in it.
  return pat_re.sub(lambda m: replace_str, s, count)
# SliceParts is for ${a-} and ${a+}, Error is for ${a?}, and SliceAndAssign is
# for ${a=}.
@@ -432,13 +323,13 @@ def _ApplyUnarySuffixOp(self, val, op):
assert arg_val.tag == value_e.Str
if val.tag == value_e.Str:
s = _DoUnarySuffixOp(val.s, op, arg_val.s)
s = libstr.DoUnarySuffixOp(val.s, op, arg_val.s)
new_val = runtime.Str(s)
else: # val.tag == value_e.StrArray:
# ${a[@]#prefix} is VECTORIZED on arrays. Oil should have this too.
strs = []
for s in val.strs:
strs.append(_DoUnarySuffixOp(s, op, arg_val.s))
strs.append(libstr.DoUnarySuffixOp(s, op, arg_val.s))
new_val = runtime.StrArray(strs)
else:
@@ -653,13 +544,13 @@ def _EvalBracedVarSub(self, part, part_vals, quoted):
pat = pat_val.s
if val.tag == value_e.Str:
s = _PatSub(val.s, op, pat, replace_str)
s = libstr.PatSub(val.s, op, pat, replace_str)
val = runtime.Str(s)
elif val.tag == value_e.StrArray:
strs = []
for s in val.strs:
strs.append(_PatSub(s, op, pat, replace_str))
strs.append(libstr.PatSub(s, op, pat, replace_str))
val = runtime.StrArray(strs)
else:
@@ -0,0 +1,6 @@
#!/bin/bash
# Character classes in globs used by Alpine's abuild.
#
# Each item is a package name, optionally followed by a version constraint
# such as '>=2.6'.  The loop prints every item with that constraint removed.
for d in 'python2-dev>=2.6' python3-dev flex bison bzip2-dev zlib-dev; do
# %% removes the longest suffix matching the glob [<>=]* -- that is, everything
# from the first '<', '>' or '=' onward.  Items without one print unchanged.
echo ${d%%[<>=]*}
done
View
@@ -0,0 +1,40 @@
-- LST for globs. Globs must be dynamically parsed.
--
-- NOTE: This schema is currently unused. It would be useful for parsing
-- and translating globs to Python's regex engine, which supports non-greedy
-- matches. But we don't want to depend on Python regexes, so we use a
-- quadratic loop like bash/mksh. This is unfortunate, but strings are
-- generally short.
--
-- The schema could still be used for some kind of automatic glob translation.
-- Pattern Matching Notation:
-- http://pubs.opengroup.org/onlinepubs/9699919799/
--
-- RE Bracket Expression.
-- NOTE: negation is [!abc] for POSIX glob, but [^abc] for POSIX regexp!
-- http://pubs.opengroup.org/onlinepubs/9699919799/
module glob {
-- A whole glob pattern is a sequence of parts.
glob = (glob_part* parts)
-- Example: *.[ch] is Star, Literal('.'), BracketExpr(False, ...)
glob_part =
Literal(string s)
| EscapedChar(string c) -- \* \? \[
| Star -- * is 0 or more characters, like the regex .*
| QMark -- ? is a single character
| BracketExpr(bool negated, char_clause* clauses)
-- One item inside a bracket expression.
char_clause =
LiteralChar(string c)
-- NOTE: Name conflict with EscapedChar in glob_part. Should be namespaced.
| EscapedChar2(string c) -- \! \-
| CharRange(string begin, string end) -- a-z 0-9
| CharClass(string name) -- [:alpha:]
-- TODO:
-- * Collating symbols are [. .]
-- * Equivalence classes are [= =]
}
Oops, something went wrong.

0 comments on commit 8fcdd42

Please sign in to comment.