Skip to content

Commit 401af27

Browse files
author
Xueming Shen
committed
8360459: UNICODE_CASE and character class with non-ASCII range does not match ASCII char
Reviewed-by: naoto
1 parent 38af17d commit 401af27

File tree

9 files changed

+2084
-5
lines changed

9 files changed

+2084
-5
lines changed

make/ToolsJdk.gmk

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ TOOL_GENERATECACERTS = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_class
7878
TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
7979
build.tools.generateextraproperties.GenerateExtraProperties
8080

81+
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
82+
build.tools.generatecharacter.CaseFolding
83+
8184
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
8285
build.tools.makezipreproducible.MakeZipReproducible
8386

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/*
2+
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation. Oracle designates this
8+
* particular file as subject to the "Classpath" exception as provided
9+
* by Oracle in the LICENSE file that accompanied this code.
10+
*
11+
* This code is distributed in the hope that it will be useful, but WITHOUT
12+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14+
* version 2 for more details (a copy is included in the LICENSE file that
15+
* accompanied this code).
16+
*
17+
* You should have received a copy of the GNU General Public License version
18+
* 2 along with this work; if not, write to the Free Software Foundation,
19+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20+
*
21+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22+
* or visit www.oracle.com if you need additional information or have any
23+
* questions.
24+
*/
25+
26+
package build.tools.generatecharacter;
27+
28+
import java.io.IOException;
29+
import java.nio.file.Files;
30+
import java.nio.file.Paths;
31+
import java.nio.file.StandardOpenOption;
32+
import java.util.stream.Collectors;
33+
import java.util.stream.Stream;
34+
35+
/**
 * Build-time tool that generates the {@code CaseFolding.java} source from a
 * template file and Unicode's {@code CaseFolding.txt}.
 *
 * <p>The line of the template containing {@code %%%Entries} is replaced by a
 * comma-separated list of {@code entry(0x<code>, 0x<mapping>)} items; every
 * other template line is copied unchanged.
 */
public class CaseFolding {

    /**
     * Runs the generator.
     *
     * @param args exactly three paths: the template file, {@code CaseFolding.txt},
     *             and the {@code .java} file to generate. With any other number
     *             of arguments a usage message is printed and the VM exits with
     *             status 1.
     * @throws IOException if an input cannot be read or the output cannot be written
     * @throws NumberFormatException if a selected line of {@code CaseFolding.txt}
     *             carries a code point that is not valid hexadecimal
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);

        // Only the C (common), T (special-cased 'I's) and S (simple) statuses are
        // supported. Compile the pattern once instead of once per input line.
        var supportedTypes = java.util.regex.Pattern.compile("^.*; [CTS]; .*$").asMatchPredicate();

        String caseFoldingEntries;
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(caseFoldingTxt)) {
            caseFoldingEntries = lines
                .filter(line -> !line.startsWith("#") && supportedTypes.test(line))
                // columns: <code>; <status>; <mapping>; # <name>
                .map(line -> line.split("; "))
                .filter(cols -> {
                    // the folding case doesn't map back to the original char.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n"));
        }

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

        // Generate .java file
        try (Stream<String> template = Files.lines(templateFile)) {
            var generated = template
                .map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
                .collect(Collectors.toList());
            Files.write(
                genSrcFile,
                generated,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }
}

make/modules/java.base/gensrc/GensrcRegex.gmk

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,22 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
5050

5151
################################################################################
5252

53+
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
54+
55+
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
56+
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
57+
58+
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
59+
$(call LogInfo, Generating $@)
60+
$(call MakeTargetDir)
61+
$(TOOL_GENERATECASEFOLDING) \
62+
$(CASEFOLDINGTEMP) \
63+
$(CASEFOLDINGTXT) \
64+
$(GENSRC_CASEFOLDING)
65+
66+
TARGETS += $(GENSRC_CASEFOLDING)
67+
68+
################################################################################
69+
5370
endif # include guard
5471
include MakeIncludeEnd.gmk

src/java.base/share/classes/java/util/regex/Pattern.java

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import java.util.stream.StreamSupport;
4545

4646
import jdk.internal.util.ArraysSupport;
47+
import jdk.internal.util.regex.CaseFolding;
4748
import jdk.internal.util.regex.Grapheme;
4849

4950
/**
@@ -2915,14 +2916,20 @@ private CharPredicate bitsOrSingle(BitClass bits, int ch) {
29152916
toLowerCase(u+212a) ==> u+006B
29162917
(6)AngstromSign u+212b
29172918
toLowerCase(u+212b) ==> u+00e5
2919+
(7) Latin Capital Letter Sharp S u+1e9e, was added in version 5.1
2920+
toLowerCase(u+1e9e) ==> u+00df
29182921
*/
29192922
if (ch < 256 &&
29202923
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
29212924
(ch == 0xff || ch == 0xb5 ||
29222925
ch == 0x49 || ch == 0x69 || //I and i
29232926
ch == 0x53 || ch == 0x73 || //S and s
29242927
ch == 0x4b || ch == 0x6b || //K and k
2925-
ch == 0xc5 || ch == 0xe5))) { //A+ring
2928+
ch == 0xc5 || ch == 0xe5 || //A+ring
2929+
// need to force single() to use SingleU specifically for u+00df.
2930+
// u+00df <-> u+1e9e, see https://codepoints.net/U+00DF.
2931+
// Character.toUpperCase('u+00df') still returns u+00df for now.
2932+
ch == 0xdf))) { // Shape S
29262933
bits.add(ch, flags0);
29272934
return null;
29282935
}
@@ -2939,7 +2946,7 @@ private CharPredicate single(final int ch) {
29392946
upper = Character.toUpperCase(ch);
29402947
lower = Character.toLowerCase(upper);
29412948
// Unicode case insensitive matches
2942-
if (upper != lower)
2949+
if (upper != lower || ch == 0xDF)
29432950
return SingleU(lower);
29442951
} else if (ASCII.isAscii(ch)) {
29452952
lower = ASCII.toLower(ch);
@@ -5960,12 +5967,29 @@ static CharPredicate CIRange(int lower, int upper) {
59605967
}
59615968

59625969
static CharPredicate CIRangeU(int lower, int upper) {
5970+
int[] closingCharacters = CaseFolding.getClassRangeClosingCharacters(lower, upper);
5971+
if (closingCharacters.length == 0) {
5972+
return ch -> {
5973+
if (inRange(lower, ch, upper))
5974+
return true;
5975+
int up = Character.toUpperCase(ch);
5976+
return (inRange(lower, up, upper) ||
5977+
inRange(lower, Character.toLowerCase(up), upper));
5978+
};
5979+
}
59635980
return ch -> {
59645981
if (inRange(lower, ch, upper))
59655982
return true;
59665983
int up = Character.toUpperCase(ch);
5967-
return inRange(lower, up, upper) ||
5968-
inRange(lower, Character.toLowerCase(up), upper);
5984+
int lo = Character.toLowerCase(up);
5985+
if (inRange(lower, up, upper) ||
5986+
inRange(lower, lo, upper))
5987+
return true;
5988+
for (int cp : closingCharacters) {
5989+
if (up == cp || lo == cp)
5990+
return true;
5991+
}
5992+
return false;
59695993
};
59705994
}
59715995

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/*
2+
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation. Oracle designates this
8+
* particular file as subject to the "Classpath" exception as provided
9+
* by Oracle in the LICENSE file that accompanied this code.
10+
*
11+
* This code is distributed in the hope that it will be useful, but WITHOUT
12+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14+
* version 2 for more details (a copy is included in the LICENSE file that
15+
* accompanied this code).
16+
*
17+
* You should have received a copy of the GNU General Public License version
18+
* 2 along with this work; if not, write to the Free Software Foundation,
19+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20+
*
21+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22+
* or visit www.oracle.com if you need additional information or have any
23+
* questions.
24+
*/
25+
26+
package jdk.internal.util.regex;
27+
28+
import java.util.Arrays;
29+
import java.util.Map;
30+
import java.util.Objects;
31+
32+
import static java.util.Map.entry;
33+
34+
/**
 * Supplementary simple case-folding data used to close regex character class
 * ranges for case-insensitive matching.
 *
 * <p>This file is a template: the {@code %%%Entries} line below is replaced at
 * build time with {@code entry(codePoint, folding)} items derived from Unicode's
 * CaseFolding.txt.
 */
public final class CaseFolding {

    // code point -> simple case folding, for the generated (non round-trip) mappings
    private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
%%%Entries
    );

    // the keys of expanded_case_map, unboxed once so lookups can scan a plain int[]
    private static final int[] expanded_case_cps = expanded_case_map.keySet()
            .stream()
            .mapToInt(Integer::intValue)
            .toArray();

    private CaseFolding() {}

    /**
     * Returns the expansion set that "closes" a regex Unicode character class range
     * for case-insensitive matching, according to the
     * <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
     * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
     * <p>
     * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches,
     * simple case folding must be applied to literals and (optionally) to character
     * classes. When applied to character classes, each character class is expected
     * to be closed under simple case folding. See the standard for the detailed
     * explanation and an example of "closed".
     * <p>
     * RL1.5 states: To meet this requirement, an implementation that supports
     * case-insensitive matching should
     * <ol>
     * <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
     * <li>Specify which character properties or constructs are closed under the matching.</li>
     * </ol>
     * <p>
     * In the {@code Pattern} implementation, five kinds of constructs may be
     * case-insensitive when matching: back-refs, string slices (sequences), singles,
     * families (char-properties) and class ranges. Singles and families may appear
     * independently or within a class.
     * <p>
     * For loose/case-insensitive matching, back-refs, slices and singles apply
     * {@code toUpperCase} and {@code toLowerCase} to both the pattern and the input
     * string. This effectively "closes" the class for matching.
     * <p>
     * The families/char-properties are not "closed" and should remain unchanged.
     * This is acceptable per RL1.5, if their behavior is clearly specified.
     * <p>
     * This method addresses that requirement for the "range" construct within a
     * character class by computing the additional characters that should be
     * included to close the range under simple case folding:
     * <p>
     * For each character in the input range {@code [start, end]} (inclusive), if the
     * character has a simple case folding mapping in Unicode's CaseFolding.txt, the
     * mapping is not a round-trip map, and the mapped character is not already in
     * the range, then that mapped character (typically lowercase) is added to the
     * expansion set.
     * <p>
     * This allows the regex character class "range" implementation to use the
     * returned expansion set to support additional case-insensitive matching,
     * without duplicating characters already covered by the existing regex range
     * implementation. The expectation is that matching is done using both the
     * uppercase and lowercase forms of the input character, for example
     *
     * <pre>{@code
     *
     *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
     *           inRange(lower, Character.toLowerCase(ch), upper) ||
     *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
     *           additionalClosingCharacters.contains(Character.toLowerCase(ch))
     * }</pre>
     *
     * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
     * @param start the starting code point of the character range
     * @param end the ending code point of the character range
     * @return an {@code int[]} containing all the simple case equivalents of
     *         characters in the range, excluding those already in the range
     */
    public static int[] getClassRangeClosingCharacters(int start, int end) {
        return Arrays.stream(expanded_case_cps)
                // keep only the mapped code points that lie inside [start, end]
                .filter(cp -> start <= cp && cp <= end)
                .map(cp -> expanded_case_map.get(cp))
                // a folding already inside the range needs no extra entry
                .filter(folded -> folded < start || folded > end)
                .toArray();
    }
}

0 commit comments

Comments
 (0)