Skip to content

Commit

Permalink
8247546: Pattern matching does not skip correctly over supplementary characters
Browse files (browse the repository at this point in the history)

Reviewed-by: clanger
Backport-of: 4946a16
  • Loading branch information
cushon authored and RealCLanger committed Aug 16, 2022
1 parent 03a5c25 commit 38c632f
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 8 deletions.
15 changes: 9 additions & 6 deletions src/java.base/share/classes/java/util/regex/Pattern.java
Expand Up @@ -1048,9 +1048,10 @@ public final class Pattern
private transient int patternLength;

/**
* If the Start node might possibly match supplementary characters.
* If the Start node might possibly match supplementary or surrogate
* code points.
* It is set to true during compiling if
* (1) There is supplementary char in pattern, or
* (1) There is supplementary or surrogate code point in pattern, or
* (2) There is complement node of a "family" CharProperty
*/
private transient boolean hasSupplementary;
Expand Down Expand Up @@ -2930,8 +2931,10 @@ private CharProperty newCharProperty(CharPredicate p) {
return null;
if (p instanceof BmpCharPredicate)
return new BmpCharProperty((BmpCharPredicate)p);
else
else {
hasSupplementary = true;
return new CharProperty(p);
}
}

/**
Expand Down Expand Up @@ -5793,18 +5796,18 @@ private static boolean inRange(int lower, int ch, int upper) {
}

/**
* Charactrs within a explicit value range
* Characters within a explicit value range
*/
static CharPredicate Range(int lower, int upper) {
if (upper < Character.MIN_HIGH_SURROGATE ||
lower > Character.MAX_HIGH_SURROGATE &&
lower > Character.MAX_LOW_SURROGATE &&
upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
return ch -> inRange(lower, ch, upper);
}

/**
* Charactrs within a explicit value range in a case insensitive manner.
* Characters within a explicit value range in a case insensitive manner.
*/
static CharPredicate CIRange(int lower, int upper) {
return ch -> inRange(lower, ch, upper) ||
Expand Down
2 changes: 1 addition & 1 deletion test/jdk/java/util/regex/RegExTest.java
Expand Up @@ -35,7 +35,7 @@
* 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
* 8194667 8197462 8184692
* 8194667 8197462 8184692 8247546
*
* @library /test/lib
* @build jdk.test.lib.RandomFactory
Expand Down
27 changes: 26 additions & 1 deletion test/jdk/java/util/regex/SupplementaryTestCases.txt
@@ -1,5 +1,5 @@
//
// Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -129,6 +129,31 @@ true \ud800\udc00pqr 0
///\ud800\udc00
///false 0

// unpaired surrogate should match
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
xxx\udca9\ud83dyyy
true \udca9 0

// surrogates in a supplementary character should not match
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
\ud83d\udca9
false 0

// unpaired surrogate should match
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
xxx\udca9\ud83dyyy
true \udca9 0

// surrogates part of a supplementary character should not match
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
\ud83d\udca9
false 0

// low surrogate part of a supplementary character should not match
[\x{dc00}-\x{dfff}]
\ud83d\udca9
false 0

// use of x modifier
\ud800\udc61bc(?x)bl\ud800\udc61h
\ud800\udc61bcbl\ud800\udc61h
Expand Down

1 comment on commit 38c632f

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.