Skip to content

Commit f01cce2

Browse files
author
Ian Graves
committed
8264160: Regex \b is not consistent with \w without UNICODE_CHARACTER_CLASS
Reviewed-by: lancea, bpb, naoto
1 parent 634800a commit f01cce2

File tree

2 files changed

+91
-18
lines changed

2 files changed

+91
-18
lines changed

src/java.base/share/classes/java/util/regex/Pattern.java

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,8 @@
158158
* <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
159159
* <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
160160
* <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
161-
* <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
161+
* <td headers="matches predef digit">A digit: {@code [0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
162+
* UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>.</td></tr>
162163
* <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
163164
* <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
164165
* <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
@@ -167,7 +168,9 @@
167168
* <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
168169
* <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
169170
* <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
170-
* <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
171+
* <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]} if
172+
* <a href="#UNICODE_CHARACTER_CLASS"> UNICODE_CHARACTER_CLASS</a> is not set. See
173+
* <a href="#unicodesupport">Unicode Support</a>.</td></tr>
171174
* <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
172175
* <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
173176
* <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
@@ -176,7 +179,8 @@
176179
* <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th>
177180
* <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr>
178181
* <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th>
179-
* <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]}</td></tr>
182+
* <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
183+
* UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>. </td></tr>
180184
* <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th>
181185
* <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr>
182186
*
@@ -246,11 +250,12 @@
246250
* <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th>
247251
* <td headers="matches bounds end_line">The end of a line</td></tr>
248252
* <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th>
249-
* <td headers="matches bounds word_boundary">A word boundary</td></tr>
253+
* <td headers="matches bounds word_boundary">A word boundary: {@code (?:(?<=\w)(?=\W)|(?<=\W)(?=\w))} (the location
254+
* where a non-word character abuts a word character)</td></tr>
250255
* <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_cluster_boundary">{@code \b{g}}</th>
251256
* <td headers="matches bounds grapheme_cluster_boundary">A Unicode extended grapheme cluster boundary</td></tr>
252257
* <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th>
253-
* <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr>
258+
* <td headers="matches bounds non_word_boundary">A non-word boundary: any position where {@code \b} does not match</td></tr>
254259
* <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th>
255260
* <td headers="matches bounds begin_input">The beginning of the input</td></tr>
256261
* <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th>
@@ -535,7 +540,7 @@
535540
* that do not capture text and do not count towards the group total, or
536541
* <i>named-capturing</i> group.
537542
*
538-
* <h2> Unicode support </h2>
543+
* <h2 id="unicodesupport"> Unicode support </h2>
539544
*
540545
* <p> This class is in conformance with Level 1 of <a
541546
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
@@ -5377,7 +5382,7 @@ static final class Bound extends Node {
53775382

53785383
boolean isWord(int ch) {
53795384
return useUWORD ? CharPredicates.WORD().is(ch)
5380-
: (ch == '_' || Character.isLetterOrDigit(ch));
5385+
: CharPredicates.ASCII_WORD().is(ch);
53815386
}
53825387

53835388
int check(Matcher matcher, int i, CharSequence seq) {

test/jdk/java/util/regex/RegExTest.java

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
3737
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
3838
* 8216332 8214245 8237599 8241055 8247546 8258259 8037397 8269753 8276694
39-
*
39+
* 8280403 8264160 8281315
4040
* @library /test/lib
4141
* @library /lib/testlibrary/java/lang
4242
* @build jdk.test.lib.RandomFactory
@@ -51,14 +51,9 @@
5151
import java.nio.file.Files;
5252
import java.nio.file.Path;
5353
import java.nio.file.Paths;
54-
import java.util.ArrayList;
55-
import java.util.Arrays;
56-
import java.util.HashMap;
57-
import java.util.List;
58-
import java.util.Map;
59-
import java.util.Random;
60-
import java.util.Scanner;
54+
import java.util.*;
6155
import java.util.function.Function;
56+
import java.util.function.IntFunction;
6257
import java.util.function.Predicate;
6358
import java.util.regex.Matcher;
6459
import java.util.regex.MatchResult;
@@ -3854,11 +3849,11 @@ public static void unicodeClassesTest() {
38543849
}
38553850

38563851
// bounds/word align
3857-
twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
3852+
twoFindIndexes(" \u0180sherman\u0400 ", boundU, 1, 10);
38583853
assertTrue(bwbU.reset("\u0180sherman\u0400").matches());
3859-
twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
3854+
twoFindIndexes(" \u0180sh\u0345erman\u0400 ", boundU, 1, 11);
38603855
assertTrue(bwbU.reset("\u0180sh\u0345erman\u0400").matches());
3861-
twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
3856+
twoFindIndexes(" \u0724\u0739\u0724 ", boundU, 1, 4);
38623857
assertTrue(bwbU.reset("\u0724\u0739\u0724").matches());
38633858
assertTrue(bwbEU.reset("\u0724\u0739\u0724").matches());
38643859
}
@@ -4503,6 +4498,8 @@ public static void surrogatePairOverlapRegion() {
45034498
}
45044499

45054500
//This test is for 8037397
4501+
//Ensure we don't drop nested interior character classes to the right of an
4502+
//intersection operator.
45064503
@Test
45074504
public static void droppedClassesWithIntersection() {
45084505
String rx = "[A-Z&&[A-Z]0-9]";
@@ -4530,6 +4527,9 @@ public static void droppedClassesWithIntersection() {
45304527
}
45314528

45324529
//This test is for 8269753
4530+
//This is for ensuring that the caret doesn't point at the wrong character
4531+
//in a syntax exception message because we previously didn't compensate for
4532+
//tabs when rendering the offending string that contained tab characters.
45334533
@Test
45344534
public static void errorMessageCaretIndentation() {
45354535
String pattern = "\t**";
@@ -4540,6 +4540,8 @@ public static void errorMessageCaretIndentation() {
45404540
}
45414541

45424542
//This test is for 8276694
4543+
//Ensure our error message indicates we have an unescaped backslash when we
4544+
//encounter one.
45434545
@Test
45444546
public static void unescapedBackslash() {
45454547
String pattern = "\\";
@@ -4549,6 +4551,7 @@ public static void unescapedBackslash() {
45494551
}
45504552

45514553
//This test is for 8280403
4554+
//Given bad intersection syntax, we should throw a PatternSyntaxException.
45524555
@Test
45534556
public static void badIntersectionSyntax() {
45544557
String pattern = "[˜\\H +F&&]";
@@ -4557,7 +4560,70 @@ public static void badIntersectionSyntax() {
45574560
assertTrue(e.getMessage().contains("Bad intersection syntax"));
45584561
}
45594562

4563+
//This test is for 8264160
4564+
//Here we check for inconsistencies between the behavior of \w and the
4565+
//behavior of \b. Prior to this fix, the two constructs did not behave in a
4566+
//consistent way, i.e. \b would recognize non-\w characters as part of a word
4567+
//in some cases. This test verifies that the two behave consistently
4568+
//for all codepoints we support.
4569+
@Test
4570+
public static void wordBoundaryInconsistencies() {
4571+
Pattern basicWordCharPattern = Pattern.compile("\\w");
4572+
Pattern basicWordCharBoundaryPattern =
4573+
Pattern.compile(";\\b.", Pattern.DOTALL);
4574+
4575+
Pattern unicodeWordCharPattern =
4576+
Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);
4577+
4578+
Pattern unicodeWordCharBoundaryPattern =
4579+
Pattern.compile(";\\b.",
4580+
Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS);
4581+
4582+
IntFunction<Boolean> basicWordCharCheck =
4583+
(cp) -> cpMatches(basicWordCharPattern, cp, false);
4584+
4585+
IntFunction<Boolean> basicBoundaryCharCheck =
4586+
(cp) -> cpMatches(basicWordCharBoundaryPattern,
4587+
cp, true);
4588+
4589+
IntFunction<Boolean> unicodeWordCharCheck =
4590+
(cp) -> cpMatches(unicodeWordCharPattern, cp, false);
4591+
4592+
IntFunction<Boolean> unicodeBoundaryCharCheck =
4593+
(cp) -> cpMatches(unicodeWordCharBoundaryPattern,
4594+
cp,true);
4595+
4596+
//basic pattern comparison
4597+
for(int cp = 0; cp <= Character.MAX_CODE_POINT; cp++){
4598+
assertEquals(basicWordCharCheck.apply(cp),
4599+
basicBoundaryCharCheck.apply(cp),
4600+
"Codepoint: " + cp);
4601+
assertEquals(unicodeWordCharCheck.apply(cp),
4602+
unicodeBoundaryCharCheck.apply(cp),
4603+
"Codepoint: " + cp);
4604+
}
4605+
}
4606+
4607+
private static boolean cpMatches(Pattern p, int cp, boolean boundary) {
4608+
String cpString;
4609+
if (Character.isBmpCodePoint(cp)) {
4610+
cpString = "" + ((char) cp);
4611+
} else {
4612+
cpString = "" + Character.highSurrogate(cp) +
4613+
Character.lowSurrogate(cp);
4614+
}
4615+
4616+
if (boundary) {
4617+
return p.matcher(";" + cpString).matches();
4618+
} else {
4619+
return p.matcher(cpString).matches();
4620+
}
4621+
}
4622+
45604623
//This test is for 8281560
4624+
//Checks that when the Canonical Equivalence flag is set, the behavior for
4625+
//Matcher::hitEnd is equivalent for these similar patterns that saw
4626+
//inconsistencies.
45614627
@Test
45624628
public static void prematureHitEndInNFCCharProperty() {
45634629
var testInput = "a1a1";
@@ -4582,6 +4648,8 @@ public static void prematureHitEndInNFCCharProperty() {
45824648
}
45834649

45844650
//This test is for 8281315
4651+
//Checks that we are able to correctly match this case with a backref
4652+
//without encountering an IndexOutOfBoundsException.
45854653
@Test
45864654
public static void iOOBForCIBackrefs(){
45874655
String line = "\ud83d\udc95\ud83d\udc95\ud83d\udc95";

0 commit comments

Comments
 (0)