Skip to content

Commit

Permalink
[regex] Parse quoted regex metacharacters as bash does.
Browse files Browse the repository at this point in the history
bash-completion relies on this odd behavior.

Also extract a gold test case and a benchmark from it
(testdata/parse-help/excerpt.sh).  The benchmark still needs to be
automated.
  • Loading branch information
Andy Chu committed Feb 4, 2019
1 parent 3958d87 commit e62f99c
Show file tree
Hide file tree
Showing 11 changed files with 571 additions and 8 deletions.
77 changes: 77 additions & 0 deletions benchmarks/parse-help.sh
@@ -0,0 +1,77 @@
#!/bin/bash
#
# A pure string-processing benchmark extracted from bash-completion.
#
# Usage:
# ./parse-help.sh <function name>

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Directory holding the captured help texts, and the script under benchmark
# that lives in the same directory.
readonly DATA_DIR='testdata/parse-help'
readonly EXCERPT="$DATA_DIR/excerpt.sh"

# Capture the --help output of real commands as benchmark input.
#
# Globals: DATA_DIR (read) - directory that receives the captured files
# Outputs: writes $DATA_DIR/ls.txt and $DATA_DIR/mypy.txt, then prints the
#          line count of every file in $DATA_DIR to stdout.
#
# TODO: Check these in to testdata/parse-help
collect() {
  mkdir -p "$DATA_DIR"

  ls --help > "$DATA_DIR/ls.txt"
  # NOTE: hard-coded per-user location of mypy; the tilde must stay unquoted
  # so that it expands to $HOME.
  ~/.local/bin/mypy --help > "$DATA_DIR/mypy.txt"

  # Quote the directory but leave the glob outside the quotes so it expands.
  wc -l "$DATA_DIR"/*
}

# Build a tiny benchmark input from the captured `ls --help` text.
#
# Globals: DATA_DIR (read)
# Outputs: the first two lines of $DATA_DIR/ls.txt that start with one or more
#          spaces followed by '-', written both to stdout and to
#          $DATA_DIR/ls-short.txt.
shorten() {
  # grep -E replaces the obsolescent egrep, which newer GNU grep warns about.
  grep -E '^[ ]+-' "$DATA_DIR/ls.txt" \
    | head -n 2 \
    | tee "$DATA_DIR/ls-short.txt"
}

# Time one shell running the excerpt's _parse_help over one captured help text.
#
# Globals:   DATA_DIR (read), EXCERPT (read)
# Arguments: $1 - shell used to run the excerpt (e.g. bash, bin/osh)
#            $2 - base name of the input file, i.e. $DATA_DIR/$2.txt
# Outputs:   whatever _parse_help prints on stdout; the `time` report for the
#            pipeline goes to stderr.
run-cmd() {
  local sh=$1
  local cmd=$2

  # The trailing '-' makes _parse_help read the help text from stdin.
  # $sh is intentionally left unquoted so that a caller may pass a shell
  # together with flags as a single argument.
  # shellcheck disable=SC2086
  time cat "$DATA_DIR/$cmd.txt" \
    | $sh "$EXCERPT" _parse_help -
}

# Geez:
# ls mypy
# bash 25ms 25ms
# OSH 600ms 900ms There is a lot of variance here too.

# Well I guess that is 25x slower? It's a computationally expensive thing.
# Oh part of this is because printf is not a builtin! Doh.
#
# TODO
# - count the number of printf invocations. But you have to do it recursively!
# - Turn this into a proper benchmark with an HTML page.

# Run the benchmark for every shell / input combination.
#
# Globals: DATA_DIR (read)
# Outputs: line counts of the input files, then a banner per shell. The stdout
#          of each run-cmd call is discarded; anything it writes to stderr
#          is left untouched.
all() {
  # Loop variables are local so they do not leak into the caller's scope.
  local sh cmd

  wc -l "$DATA_DIR"/*

  for sh in bash bin/osh; do
    echo
    echo "--- $sh --- "
    echo

    for cmd in ls-short ls mypy; do
      run-cmd "$sh" "$cmd" >/dev/null
    done
  done
}

# Trace a single bin/osh run of _parse_help over the short input.
#
# Globals: DATA_DIR (read), EXCERPT (read), PS4 (written and exported)
# Outputs: _parse_help's stdout; the `time` report goes to stderr.
one() {
  local sh='bin/osh'
  local cmd='ls-short'

  # Exported so the child shell started with -x picks it up. Single quotes
  # keep the ${...} references unexpanded in the stored value.
  export PS4='+[${LINENO}:${FUNCNAME[0]}] '

  time cat "$DATA_DIR/$cmd.txt" | "$sh" -x "$EXCERPT" _parse_help -
}

# Time _parse_help over the short input, first under bin/osh and then under
# bash, with a '---' separator line between the two outputs.
#
# Globals: DATA_DIR (read), EXCERPT (read)
# Outputs: both runs' stdout; each `time` report goes to stderr.
compare-one() {
  local cmd='ls-short'

  time cat "$DATA_DIR/$cmd.txt" | bin/osh "$EXCERPT" _parse_help -
  echo ---
  time cat "$DATA_DIR/$cmd.txt" | bash "$EXCERPT" _parse_help -
}

# Dispatch: run the function named by the first command-line argument, passing
# the remaining arguments to it.
"$@"
2 changes: 1 addition & 1 deletion core/id_kind.py
Expand Up @@ -149,7 +149,7 @@ def AddKinds(spec):

spec.AddKind('Lit', [
'Chars', 'VarLike', 'ArrayLhsOpen', 'ArrayLhsClose',
'Other', 'EscapedChar',
'Other', 'EscapedChar', 'RegexMeta',
# Either brace expansion or keyword for { and }
'LBrace', 'RBrace', 'Comma',
'DRightBracket', # the ]] that matches [[, NOT a keyword
Expand Down
11 changes: 5 additions & 6 deletions frontend/lex.py
Expand Up @@ -374,15 +374,14 @@ def IsKeyword(name):
R(r'[a-zA-Z0-9_/-]+', Id.Lit_Chars), # not including period
R(r'[ \t\r]+', Id.WS_Space),

# From _BACKSLASH
# Normally, \x evaluates to x. But quoted regex metacharacters like \* should
# evaluate to \*. Compare with ( | ).
R(r'\\[*+?.^$\[\]]', Id.Lit_RegexMeta),

# Everything else is an escape.
R(r'\\[^\n\0]', Id.Lit_EscapedChar),
C('\\\n', Id.Ignored_LineCont),

#C('{', Id.Lit_RegexMeta), # { -> \{
#C('}', Id.Lit_RegexMeta), # } -> \}
# In [[ foo =~ foo$ ]], the $ doesn't get escaped
#C('$', Id.Lit_RegexMeta),

# NOTE: ( | and ) aren't operators!
R(r'[^\0]', Id.Lit_Other), # everything else is literal
]
Expand Down
1 change: 1 addition & 0 deletions osh/expr_eval.py
Expand Up @@ -734,6 +734,7 @@ def Eval(self, node):
return s1 != s2

if op_id == Id.BoolBinary_EqualTilde:
# TODO: This should go to --debug-file
#log('Matching %r against regex %r', s1, s2)
try:
matches = libc.regex_match(s2, s1)
Expand Down
5 changes: 4 additions & 1 deletion osh/word_eval.py
Expand Up @@ -680,7 +680,10 @@ def _EvalBracedVarSub(self, part, part_vals, quoted):

regex, warnings = glob_.GlobToERE(pat_val.s)
if warnings:
# TODO: Add strict mode and expose warnings.
# TODO:
# - Add 'set -o strict-glob' mode and expose warnings.
# "Glob is not in CANONICAL FORM".
# - Propagate location info back to the 'op.pat' word.
pass
replacer = string_ops.GlobReplacer(regex, replace_str, op.spids[0])

Expand Down
55 changes: 55 additions & 0 deletions spec/regex.test.sh
Expand Up @@ -91,6 +91,14 @@ pat="^(a b)$"
## OK zsh stdout: true
## OK zsh status: 0

#### Mixing quoted and unquoted parts
[[ 'a b' =~ 'a 'b ]] && echo true
[[ "a b" =~ "a "'b' ]] && echo true
## STDOUT:
true
true
## END

#### Regex with == and not =~ is parse error, different lexer mode required
# They both give a syntax error. This is lame.
[[ '^(a b)$' == ^(a\ b)$ ]] && echo true
Expand Down Expand Up @@ -125,6 +133,53 @@ pat="^(a b)$"
## N-I zsh stdout-json: ""
## N-I zsh status: 1

#### Regex to match literal brackets []

# bash-completion relies on this, so we're making it match bash.
# zsh understandably differs.
[[ '[]' =~ \[\] ]] && echo true

# Another way to write this.
pat='\[\]'
[[ '[]' =~ $pat ]] && echo true
## STDOUT:
true
true
## END
## OK zsh STDOUT:
true
## END

#### Regex to match literals . ^ $ etc.
[[ 'x' =~ \. ]] || echo false
[[ '.' =~ \. ]] && echo true

[[ 'xx' =~ \^\$ ]] || echo false
[[ '^$' =~ \^\$ ]] && echo true

[[ 'xxx' =~ \+\*\? ]] || echo false
[[ '*+?' =~ \*\+\? ]] && echo true

[[ 'xx' =~ \{\} ]] || echo false
[[ '{}' =~ \{\} ]] && echo true
## STDOUT:
false
true
false
true
false
true
false
true
## END
## BUG zsh STDOUT:
true
false
false
false
## END
## BUG zsh status: 1

#### Unquoted { is parse error in bash/zsh
[[ { =~ { ]] && echo true
echo status=$?
Expand Down
9 changes: 9 additions & 0 deletions test/gold.sh
Expand Up @@ -127,6 +127,13 @@ errexit-confusion() {
_compare gold/errexit-confusion.sh run-for-release-FIXED
}

# Gold case for the bash-completion excerpt: hands the excerpt script, the
# function name _parse_help and the command 'ls' to _compare.
parse-help() {
  local fixture_dir='testdata/parse-help'

  # Not hermetic: the excerpt ends up invoking the host's 'ls'.
  _compare "$fixture_dir/excerpt.sh" _parse_help ls
}

readonly -a PASSING=(
# FLAKY: This one differs by timestamp
#version-text
Expand Down Expand Up @@ -158,6 +165,8 @@ readonly -a PASSING=(

errexit-confusion

parse-help

# This one takes a little long, but it's realistic.
#wild

Expand Down
77 changes: 77 additions & 0 deletions testdata/parse-help/excerpt.sh
@@ -0,0 +1,77 @@
#!/bin/bash
#
# A string processing test case copied from bash_completion.

# This function shell-quotes the argument
#
# $1: string to quote
# Output: $1 wrapped in single quotes, with every embedded ' rewritten as
#         '\'' ; no trailing newline is printed.
quote()
{
# Replace each ' with '\'' (close the quote, emit an escaped quote, reopen).
local quoted=${1//\'/\'\\\'\'}
printf "'%s'" "$quoted"
}

# This function shell-dequotes the argument
#
# $1: shell-quoted text
# Output: the text produced when the shell re-parses $1, printed without a
#         trailing newline.
dequote()
{
# eval makes the shell parse $1 again, which removes one level of quoting.
# Anything eval writes to stderr is discarded.
# NOTE(review): eval also runs any expansions contained in $1, so this is
# only appropriate for trusted input.
eval printf %s "$1" 2> /dev/null
}

# Helper function for _parse_help and _parse_usage.
#
# $1: text containing one or more spellings of an option
# Output: the selected option on stdout, one per line; a "--[no]foo" style
#         option yields two lines (without and with the bracketed prefix).
# Returns: non-zero, printing nothing, when no option is found in $1.
__parse_options()
{
# Split on whitespace and on the characters , / | as well.
local option option2 i IFS=$' \t\n,/|'

# Take first found long option, or first one (short) if not found.
option=
local -a array
read -a array <<<"$1"
for i in "${array[@]}"; do
case "$i" in
# Three or more leading dashes: stop scanning.
---*) break ;;
# Long option: take it and stop.
--?*) option=$i ; break ;;
# Short option: keep only the first one seen and continue scanning.
-?*) [[ $option ]] || option=$i ;;
# Any other word ends the scan.
*) break ;;
esac
done
# No option found: return with the failing status of the test.
[[ $option ]] || return

IFS=$' \t\n' # affects parsing of the regexps below...

# Expand --[no]foo to --foo and --nofoo etc
# BASH_REMATCH[1] is the bracketed group, BASH_REMATCH[2] is its contents.
if [[ $option =~ (\[((no|dont)-?)\]). ]]; then
# First variant: the option with the bracketed group removed.
option2=${option/"${BASH_REMATCH[1]}"/}
# Cut everything from the first of < { ( ) . [ onwards.
option2=${option2%%[<{().[]*}
# Print it, reducing "=..." to a bare "=".
printf '%s\n' "${option2/=*/=}"
# Second variant: the bracketed group replaced by its contents.
option=${option/"${BASH_REMATCH[1]}"/"${BASH_REMATCH[2]}"}
fi
# Same truncation and "=" handling for the (possibly rewritten) option.
option=${option%%[<{().[]*}
printf '%s\n' "${option/=*/=}"
}
# Parse GNU style help output of the given command.
# @param $1 command; if "-", read from stdin and ignore rest of args
# @param $2 command options (default: --help)
#
# Output: the lines printed by __parse_options for every help line that
#         consists of optional blanks followed by "-" and further text.
_parse_help()
{
# Pass $1 through quote and eval so that cmd receives it as a single word.
eval local cmd=$( quote "$1" )
local line
# Help text source: stdin when cmd is "-", otherwise the command itself, run
# with LC_ALL=C and with its stderr merged into stdout.
{ case $cmd in
-) cat ;;
*) LC_ALL=C "$( dequote "$cmd" )" ${2:---help} 2>&1 ;;
esac } \
| while read -r line; do
# Skip lines that are not optional blanks followed by "-".
[[ $line == *([[:blank:]])-* ]] || continue
# transform "-f FOO, --foo=FOO" to "-f , --foo=FOO" etc
# Each pass replaces the whole match with its first capture group, which
# drops the upper-case argument placeholder that follows a short option.
while [[ $line =~ \
((^|[^-])-[A-Za-z0-9?][[:space:]]+)\[?[A-Z0-9]+\]? ]]; do
line=${line/"${BASH_REMATCH[0]}"/"${BASH_REMATCH[1]}"}
done
# Rewrite " or " as ", " before extracting the option.
__parse_options "${line// or /, }"
done
}
# Run the function named by the first argument, passing the remaining
# arguments to it.
"$@"
2 changes: 2 additions & 0 deletions testdata/parse-help/ls-short.txt
@@ -0,0 +1,2 @@
-a, --all do not ignore entries starting with .
-A, --almost-all do not list implied . and ..

0 comments on commit e62f99c

Please sign in to comment.