Permalink
Please sign in to comment.
Browse files
Split patsub and var-op-strip implementations into their own module.
Alpine Linux's abuild uses character classes in globs. To support that properly, I want to rewrite it without the Python regex module. - gold/*.sh: Test case for running abuild. - osh/glob.asdl: Sketch out an LST schema for POSIX globs. Not used. Unrelated: - Remove invalid state transition in core/legacy.py
- Loading branch information...
Showing
with
213 additions
and 123 deletions.
- +6 −2 core/legacy.py
- +149 −0 core/libstr.py
- +0 −2 core/runtime.asdl
- +5 −114 core/word_eval.py
- +6 −0 gold/strip-op-char-class.sh
- +40 −0 osh/glob.asdl
- +5 −5 scripts/count.sh
- +2 −0 test/gold.sh
| @@ -0,0 +1,149 @@ | ||
| #!/usr/bin/python | ||
| """ | ||
| libstr.py - String library functions that can be exposed with a saner syntax. | ||
| Instead of | ||
| local y=${x//a*/b} | ||
| var y = x -> sub('a*', 'b', :ALL) | ||
| Or maybe: | ||
| var y = x -> sub( g/a*/, 'b', :ALL) | ||
| """ | ||
| from core import glob_ | ||
| from core.id_kind import Id | ||
| # Implementation without Python regex: | ||
| # | ||
| # (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and | ||
| # regexec in a loop. fnmatch() does NOT give positions of matches. | ||
| # | ||
| # (2) Strip -- % %% # ## - | ||
| # | ||
| # a. Fast path for constant strings. | ||
| # b. Convert to POSIX extended regex, to see if it matches at ALL. If it | ||
| # doesn't match, short circuit out? We can't do this with fnmatch. | ||
| # c. If it does match, call fnmatch() iteratively over prefixes / suffixes. | ||
| # | ||
| # - # shortest prefix - [:1], [:2], [:3] until it matches | ||
| # - ## longest prefix - [:-1] [:-2] [:-3]. Works because fnmatch does not | ||
| # match prefixes, it matches EXACTLY. | ||
| # - % shortest suffix - [-1:] [-2:] [-3:] ... | ||
| # - %% longest suffix - [1:] [2:] [3:] | ||
| # | ||
| # See remove_pattern() in subst.c for bash, and trimsub() in eval.c for | ||
| # mksh. Dash doesn't implement it. | ||
| # TODO: | ||
| # - Unicode support: Convert both pattern, string, and replacement to unicode, | ||
| # then the result back at the end. | ||
| # - Add location info to errors. Maybe pass spid pair all the way down. | ||
| # - Compile time errors for [[:space:]] ? | ||
def DoUnarySuffixOp(s, op, arg):
    """Helper for ${x#prefix} and family: strip a glob-matched prefix or suffix.

    Args:
      s: The string value to strip.
      op: The operator node.  Only op.op_id is read; it selects the variant:
          Id.VOp1_Pound     #   shortest matching prefix
          Id.VOp1_DPound    ##  longest matching prefix
          Id.VOp1_Percent   %   shortest matching suffix
          Id.VOp1_DPercent  %%  longest matching suffix
      arg: The glob pattern, as a string.

    Returns:
      s with the matching prefix/suffix removed, or s itself when the pattern
      does not match.

    Raises:
      AssertionError: if op.op_id is not one of the four strip operators
        (e.g. ^ ^^ , ,,).
    """
    # Function-local import: the module-level imports visible in this file do
    # not bring `re` into scope.
    import re

    pat_re, err = glob_.GlobToPythonRegex(arg)
    if err:
        # NOTE(review): e_die is not imported in the visible part of this
        # module -- confirm it is in scope.
        e_die("Can't convert glob to regex: %r", arg)

    op_id = op.op_id

    if pat_re is None:  # simple/fast path for fixed strings
        if op_id in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
            if s.startswith(arg):
                return s[len(arg):]
            return s

        if op_id in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
            if s.endswith(arg):
                # Compute the end index explicitly.  s[:-len(arg)] is wrong
                # for an empty arg, because s[:-0] is s[:0], i.e. ''.
                return s[:len(s) - len(arg)]
            return s

        # e.g. ^ ^^ , ,,
        raise AssertionError(op_id)

    # Glob pattern.  re.DOTALL is used throughout because a newline is an
    # ordinary character for shell patterns; without it '.' (both in the
    # '(.*)' groups below and in the translated glob) stops at a newline.
    if op_id == Id.VOp1_Pound:  # shortest prefix
        # Need a non-greedy translation of the glob.
        lazy_re, _ = glob_.GlobToPythonRegex(arg, greedy=False)
        m = re.compile(lazy_re, re.DOTALL).match(s)
        if m:
            return s[m.end(0):]
        return s

    if op_id == Id.VOp1_DPound:  # longest prefix
        m = re.compile(pat_re, re.DOTALL).match(s)
        if m:
            return s[m.end(0):]
        return s

    # For the suffix operators, capture everything BEFORE the suffix and
    # return that group.  The anchor is \Z rather than $, because $ also
    # matches just before a trailing newline.
    if op_id == Id.VOp1_Percent:  # shortest suffix
        # Greedy leading group => the remaining suffix match is the shortest.
        # NOTE: This is different than re.search, which will find the longest
        # suffix.
        m = re.compile('^(.*)' + pat_re + r'\Z', re.DOTALL).match(s)
        if m:
            return m.group(1)
        return s

    if op_id == Id.VOp1_DPercent:  # longest suffix
        # Non-greedy leading group => the remaining suffix match is the
        # longest.
        m = re.compile('^(.*?)' + pat_re + r'\Z', re.DOTALL).match(s)
        if m:
            return m.group(1)
        return s

    raise AssertionError(op_id)
def PatSub(s, op, pat, replace_str):
    """Helper for ${x/pat/replace}: substitute glob matches in a string.

    Args:
      s: The string value to operate on.
      op: The operator node.  Three boolean attributes are read, checked in
          this order: do_all (replace every match), do_prefix (match only at
          the start), do_suffix (match only at the end).  If none is set, only
          the first match is replaced.
      pat: The glob pattern, as a string.
      replace_str: The replacement text.  It is always inserted literally.

    Returns:
      The string with the substitution(s) applied, or s unchanged when
      nothing matches.
    """
    # Function-local import: the module-level imports visible in this file do
    # not bring `re` into scope.
    import re

    #log('PAT %r REPLACE %r', pat, replace_str)
    py_regex, err = glob_.GlobToPythonRegex(pat)
    if err:
        # NOTE(review): e_die is not imported in the visible part of this
        # module -- confirm it is in scope.
        e_die("Can't convert glob to regex: %r", pat)

    if py_regex is None:  # Simple/fast path for fixed strings
        if not pat:
            # Empty pattern.  str.replace('', r) would insert r between every
            # character, and s[:-0] would drop the whole string, so handle it
            # explicitly: anchored forms prepend/append, unanchored forms
            # match nothing.
            # NOTE(review): this mirrors bash's behavior as I understand it;
            # confirm against the spec tests.
            if op.do_all:
                return s
            if op.do_prefix:
                return replace_str + s
            if op.do_suffix:
                return s + replace_str
            return s

        if op.do_all:
            return s.replace(pat, replace_str)

        if op.do_prefix:
            if s.startswith(pat):
                return replace_str + s[len(pat):]
            return s

        if op.do_suffix:
            if s.endswith(pat):
                return s[:len(s) - len(pat)] + replace_str
            return s

        return s.replace(pat, replace_str, 1)  # just the first one

    count = 1  # replace first occurrence only
    if op.do_all:
        count = 0  # for re.sub, 0 means replace all
    elif op.do_prefix:
        py_regex = '^' + py_regex
    elif op.do_suffix:
        # \Z rather than $: $ also matches just before a trailing newline.
        py_regex = py_regex + r'\Z'

    # re.DOTALL: a newline is an ordinary character for shell patterns.
    pat_re = re.compile(py_regex, re.DOTALL)

    # Pass a function rather than the string itself.  A string replacement is
    # treated as a template by re.sub (backslash escapes and group references
    # are processed); a function's return value is inserted verbatim.
    return pat_re.sub(lambda m: replace_str, s, count)
| @@ -0,0 +1,6 @@ | ||
#!/bin/bash
# Character classes in globs used by Alpine's abuild.
#
# Each item is a package name, optionally followed by a version constraint
# (e.g. '>=2.6').  ${d%%[<>=]*} removes the longest suffix that begins with
# '<', '>' or '=', so only the bare package name is printed.
for d in 'python2-dev>=2.6' python3-dev flex bison bzip2-dev zlib-dev; do
echo ${d%%[<>=]*}
done
| @@ -0,0 +1,40 @@ | ||
| -- LST for globs. Globs must be dynamically parsed. | ||
| -- | ||
| -- NOTE: This schema is currently unused. It would be useful for parsing | ||
| -- and translating globs to Python's regex engine, which supports non-greedy | ||
| -- matches. But we don't want to depend on Python regexes, so we use a | ||
| -- quadratic loop like bash/mksh. This is unfortunate, but strings are | ||
| -- generally short. | ||
| -- | ||
| -- The schema could still be used for some kind of automatic glob translation. | ||
| -- Pattern Matching Notation: | ||
| -- http://pubs.opengroup.org/onlinepubs/9699919799/ | ||
| -- | ||
| -- RE Bracket Expression. | ||
| -- NOTE: negation is [!abc] for POSIX glob, but [^abc] for POSIX regexp! | ||
| -- http://pubs.opengroup.org/onlinepubs/9699919799/ | ||
module glob {
  -- A glob pattern is a sequence of parts.
  glob = (glob_part* parts)

  -- Example: *.[ch] is Star, Literal('.'), BracketExpr(False, ...)
  glob_part =
    Literal(string s)
  | EscapedChar(string c) -- \* \? \[
  | Star -- * is 0 or more characters, like the regex .*
  | QMark -- ? is a single character
  | BracketExpr(bool negated, char_clause* clauses)

  -- One item inside a bracket expression.
  char_clause =
    LiteralChar(string c)
  -- NOTE: Name conflict with above. Should be namespaced.
  | EscapedChar2(string c) -- \! \-
  | CharRange(string begin, string end) -- a-z 0-9
  | CharClass(string name) -- [:alpha:]

  -- TODO:
  -- * Collating symbols are [. .]
  -- * Equivalence classes are [= =]
}
Oops, something went wrong.
0 comments on commit
8fcdd42