Skip to content

Commit 38c632f

Browse files
cushon authored and RealCLanger committed
8247546: Pattern matching does not skip correctly over supplementary characters
Reviewed-by: clanger
Backport-of: 4946a16
1 parent 03a5c25 commit 38c632f

File tree

3 files changed

+36
-8
lines changed

3 files changed

+36
-8
lines changed

src/java.base/share/classes/java/util/regex/Pattern.java

+9-6
Original file line numberDiff line numberDiff line change
@@ -1048,9 +1048,10 @@ public final class Pattern
10481048
private transient int patternLength;
10491049

10501050
/**
1051-
* If the Start node might possibly match supplementary characters.
1051+
* If the Start node might possibly match supplementary or surrogate
1052+
* code points.
10521053
* It is set to true during compiling if
1053-
* (1) There is supplementary char in pattern, or
1054+
* (1) There is supplementary or surrogate code point in pattern, or
10541055
* (2) There is complement node of a "family" CharProperty
10551056
*/
10561057
private transient boolean hasSupplementary;
@@ -2930,8 +2931,10 @@ private CharProperty newCharProperty(CharPredicate p) {
29302931
return null;
29312932
if (p instanceof BmpCharPredicate)
29322933
return new BmpCharProperty((BmpCharPredicate)p);
2933-
else
2934+
else {
2935+
hasSupplementary = true;
29342936
return new CharProperty(p);
2937+
}
29352938
}
29362939

29372940
/**
@@ -5793,18 +5796,18 @@ private static boolean inRange(int lower, int ch, int upper) {
57935796
}
57945797

57955798
/**
5796-
* Charactrs within a explicit value range
5799+
* Characters within an explicit value range
57975800
*/
57985801
static CharPredicate Range(int lower, int upper) {
57995802
if (upper < Character.MIN_HIGH_SURROGATE ||
5800-
lower > Character.MAX_HIGH_SURROGATE &&
5803+
lower > Character.MAX_LOW_SURROGATE &&
58015804
upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
58025805
return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
58035806
return ch -> inRange(lower, ch, upper);
58045807
}
58055808

58065809
/**
5807-
* Charactrs within a explicit value range in a case insensitive manner.
5810+
* Characters within an explicit value range in a case insensitive manner.
58085811
*/
58095812
static CharPredicate CIRange(int lower, int upper) {
58105813
return ch -> inRange(lower, ch, upper) ||

test/jdk/java/util/regex/RegExTest.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
* 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819
3636
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
3737
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
38-
* 8194667 8197462 8184692
38+
* 8194667 8197462 8184692 8247546
3939
*
4040
* @library /test/lib
4141
* @build jdk.test.lib.RandomFactory

test/jdk/java/util/regex/SupplementaryTestCases.txt

+26-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved.
2+
// Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.
33
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
//
55
// This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,31 @@ true \ud800\udc00pqr 0
129129
///\ud800\udc00
130130
///false 0
131131

132+
// unpaired surrogate should match
133+
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
134+
xxx\udca9\ud83dyyy
135+
true \udca9 0
136+
137+
// surrogates in a supplementary character should not match
138+
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
139+
\ud83d\udca9
140+
false 0
141+
142+
// unpaired surrogate should match
143+
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
144+
xxx\udca9\ud83dyyy
145+
true \udca9 0
146+
147+
// surrogates part of a supplementary character should not match
148+
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
149+
\ud83d\udca9
150+
false 0
151+
152+
// low surrogate part of a supplementary character should not match
153+
[\x{dc00}-\x{dfff}]
154+
\ud83d\udca9
155+
false 0
156+
132157
// use of x modifier
133158
\ud800\udc61bc(?x)bl\ud800\udc61h
134159
\ud800\udc61bcbl\ud800\udc61h

0 commit comments

Comments
 (0)