/
regex.test.sh
230 lines (202 loc) · 5.36 KB
/
regex.test.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/env bash
#
# Only bash and zsh seem to implement [[ foo =~ '' ]]
#
# ^(a b)$ is a regex that should match 'a b' in a group.
#
# Not sure what bash is doing here... I think I have to just be empirical.
# Might need "compat" switch for parsing the regex. It should be an opaque
# string like zsh, not sure why it isn't.
#
# I think this is just papering over bugs...
# https://www.gnu.org/software/bash/manual/bash.html#Conditional-Constructs
#
# Storing the regular expression in a shell variable is often a useful way to
# avoid problems with quoting characters that are special to the shell. It is
# sometimes difficult to specify a regular expression literally without using
# quotes, or to keep track of the quoting used by regular expressions while
# paying attention to the shell’s quote removal. Using a shell variable to
# store the pattern decreases these problems. For example, the following is
# equivalent to the above:
#
# pattern='[[:space:]]*(a)?b'
# [[ $line =~ $pattern ]]
#
# If you want to match a character that’s special to the regular expression
# grammar, it has to be quoted to remove its special meaning. This means that in
# the pattern ‘xxx.txt’, the ‘.’ matches any character in the string (its usual
# regular expression meaning), but in the pattern ‘"xxx.txt"’ it can only match a
# literal ‘.’. Shell programmers should take special care with backslashes, since
# backslashes are used both by the shell and regular expressions to remove the
# special meaning from the following character. The following two sets of
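# commands are not equivalent:
#
#     pattern='\.'
#
#     [[ . =~ $pattern ]]
#     [[ . =~ \. ]]
#
#     [[ . =~ "$pattern" ]]
#     [[ . =~ '\.' ]]
#
# The first two matches will succeed, but the second two will not, because in
# the second two the backslash will be part of the pattern to be matched.
#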
# From the bash source code: ( | ) are treated specially. Normally they must be
# quoted, but they can be UNQUOTED in the BASH_REGEX state. In fact they can't
# be quoted!
#### BASH_REMATCH
# After a successful =~ match, BASH_REMATCH is expected to hold the whole match
# followed by one element per parenthesized group.  The array is passed to the
# external argv.py helper so each element shows up separately in the output.
# The zsh expectation differs: a single empty string.
[[ foo123 =~ ([a-z]+)([0-9]+) ]]
argv.py "${BASH_REMATCH[@]}"
## STDOUT:
['foo123', 'foo', '123']
## END
## N-I zsh STDOUT:
['']
## END
#### Match is unanchored at both ends
# The pattern has no ^ or $, so it is expected to match the 'a' in the middle
# of 'bar'.
[[ 'bar' =~ a ]] && echo true
## stdout: true
#### Failed match
# 'bar' contains no 'X': the [[ ]] is expected to fail, so the && list prints
# nothing and the case ends with status 1.
[[ 'bar' =~ X ]] && echo true
## status: 1
## stdout-json: ""
#### Regex quoted with \ -- preferred in bash
# Only the space is backslash-escaped; ^ ( ) $ stay unquoted and are expected
# to keep their regex meaning, so 'a b' matches.
[[ 'a b' =~ ^(a\ b)$ ]] && echo true
## stdout: true
#### Regex quoted with single quotes
# bash doesn't like the quotes: the single-quoted pattern is expected NOT to
# match (no output, status 1).  zsh is expected to match and print true.
[[ 'a b' =~ '^(a b)$' ]] && echo true
## stdout-json: ""
## status: 1
## OK zsh stdout: true
## OK zsh status: 0
#### Regex quoted with double quotes
# bash doesn't like the quotes: the double-quoted pattern is expected NOT to
# match (no output, status 1).  zsh is expected to match and print true.
[[ 'a b' =~ "^(a b)$" ]] && echo true
## stdout-json: ""
## status: 1
## OK zsh stdout: true
## OK zsh status: 0
#### Fix single quotes by storing in variable
# The pattern is assigned with single quotes, then expanded UNQUOTED on the
# right of =~.  In that form it is expected to act as a regex and match.
pat='^(a b)$'
[[ 'a b' =~ $pat ]] && echo true
## stdout: true
#### Fix double quotes by storing in variable
# Here the pattern is assigned with double quotes; the unquoted $pat expansion
# is still expected to act as a regex and match.
pat="^(a b)$"
[[ 'a b' =~ $pat ]] && echo true
## stdout: true
#### Double quoting pat variable -- again bash doesn't like it.
# Quoting the expansion itself ("$pat") is expected to defeat the match in
# bash (no output, status 1); zsh is expected to match and print true.
pat="^(a b)$"
[[ 'a b' =~ "$pat" ]] && echo true
## stdout-json: ""
## status: 1
## OK zsh stdout: true
## OK zsh status: 0
#### Mixing quoted and unquoted parts
# The right-hand side is built from adjacent quoted and unquoted fragments
# (first line) and from two differently-quoted fragments (second line).  Both
# are expected to match 'a b'.
[[ 'a b' =~ 'a 'b ]] && echo true
[[ "a b" =~ "a "'b' ]] && echo true
## STDOUT:
true
true
## END
#### Regex with == and not =~ is parse error, different lexer mode required
# They both give a syntax error. This is lame.
# The same right-hand side that is accepted after =~ contains unquoted ( ),
# which is expected to be rejected after ==: status 2 (zsh: status 1).
[[ '^(a b)$' == ^(a\ b)$ ]] && echo true
## status: 2
## OK zsh status: 1
#### Omitting ( )
# Without the parentheses the == right-hand side parses.  The left operand is
# the literal text '^a b$', and the comparison is expected to succeed.
[[ '^a b$' == ^a\ b$ ]] && echo true
## stdout: true
#### Malformed regex
# Are they trying to PARSE the regex? Do they feed the buffer directly to
# regcomp()?
# The parentheses are deliberately reversed: ")" comes before "(".  The
# expected status is 2 (zsh: status 1).
[[ 'a b' =~ ^)a\ b($ ]] && echo true
## status: 2
## OK zsh status: 1
#### Regex with char class containing space
# For some reason it doesn't work without parens?
# The bracket expression [a b] contains an unescaped space and is wrapped in
# ( ); in this form the whole thing is expected to be taken as one regex.
[[ 'ba ba ' =~ ([a b]+) ]] && echo true
## stdout: true
#### Operators and space lose meaning inside ()
# Inside the parentheses, < > and the space are expected to be ordinary regex
# characters rather than shell operators / word separators.
# zsh expectation: no output, status 1.
[[ '< >' =~ (< >) ]] && echo true
## stdout: true
## N-I zsh stdout-json: ""
## N-I zsh status: 1
#### Regex with |
# The unquoted | is expected to be regex alternation (foo OR bar), not a shell
# pipe, so 'bar' matches.  zsh expectation: no output, status 1.
[[ 'bar' =~ foo|bar ]] && echo true
## stdout: true
## N-I zsh stdout-json: ""
## N-I zsh status: 1
#### Regex to match literal brackets []
# bash-completion relies on this, so we're making it match bash.
# zsh understandably differs.
# Two spellings of the same pattern are checked: backslash-escaped inline, and
# stored in a variable that is expanded unquoted.  Both are expected to match;
# zsh is expected to print true only once.
[[ '[]' =~ \[\] ]] && echo true
# Another way to write this.
pat='\[\]'
[[ '[]' =~ $pat ]] && echo true
## STDOUT:
true
true
## END
## OK zsh STDOUT:
true
## END
#### Regex to match literals . ^ $ etc.
# The checks come in pairs, one pair per group of backslash-escaped regex
# metacharacters.  The first line of a pair uses a subject without those
# characters and is expected to fail (|| prints false); the second uses the
# literal characters and is expected to match (&& prints true).
[[ 'x' =~ \. ]] || echo false
[[ '.' =~ \. ]] && echo true
[[ 'xx' =~ \^\$ ]] || echo false
[[ '^$' =~ \^\$ ]] && echo true
# NOTE(review): the negative check escapes the characters in the order + * ?
# while the positive check uses * + ?.  'xxx' contains none of them, so the
# expected bash output is the same either way -- confirm whether the differing
# order is intentional.
[[ 'xxx' =~ \+\*\? ]] || echo false
[[ '*+?' =~ \*\+\? ]] && echo true
[[ 'xx' =~ \{\} ]] || echo false
[[ '{}' =~ \{\} ]] && echo true
## STDOUT:
false
true
false
true
false
true
false
true
## END
## BUG zsh STDOUT:
true
false
false
false
## END
## BUG zsh status: 1
#### Unquoted { is parse error in bash/zsh
# A bare { is used as both subject and pattern.  The following echo records
# the resulting status: expected status=2 (zsh: status=1), and no "true".
[[ { =~ { ]] && echo true
echo status=$?
## STDOUT:
status=2
## END
## N-I zsh STDOUT:
status=1
## END
#### Quoted {
# With the pattern's brace double-quoted, the match is expected to succeed:
# prints true, then status=0.  zsh expectation: only status=1.
[[ { =~ "{" ]] && echo true
echo status=$?
## STDOUT:
true
status=0
## END
## N-I zsh STDOUT:
status=1
## END
#### Escaped {
# from bash-completion
# \$ and \{ are backslash-escaped to be literal; the \{ is optional (?).  For
# the subject '$PA' the groups are expected to capture '$' and 'PA'.
# zsh expectation: no output, status 1.
[[ '$PA' =~ ^(\$\{?)([A-Za-z0-9_]*)$ ]] && argv.py "${BASH_REMATCH[@]}"
## STDOUT:
['$PA', '$', 'PA']
## END
## BUG zsh stdout-json: ""
## BUG zsh status: 1
#### Escaped { stored in variable first
# from bash-completion
# Same pattern as the inline "Escaped {" case, but assigned to a variable with
# single quotes and expanded unquoted.  The captures are expected to be
# identical; the zsh expectation is a single empty string.
pat='^(\$\{?)([A-Za-z0-9_]*)$'
[[ '$PA' =~ $pat ]] && argv.py "${BASH_REMATCH[@]}"
## STDOUT:
['$PA', '$', 'PA']
## END
## BUG zsh STDOUT:
['']
## END
#### regex with ?
# An unquoted ? is expected to act as the regex "optional" operator: c? should
# match both a single 'c' and the empty string.
[[ 'c' =~ c? ]] && echo true
[[ '' =~ c? ]] && echo true
## STDOUT:
true
true
## END