Skip to content

Commit 9740c24

Browse files
naotoj and Justin Lu committed
8301400: Allow additional characters for GB18030-2022 support
Co-authored-by: Justin Lu <jlu@openjdk.org>
Reviewed-by: lancea, iris, rriggs
Backport-of: a039f39ee8486e990a4b4341c9e07fc49d3dac2d
1 parent 3147b1b commit 9740c24

File tree

5 files changed

+76
-63
lines changed

5 files changed

+76
-63
lines changed

jdk/make/data/characterdata/CharacterData00.java.template

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -106,19 +106,23 @@ class CharacterData00 extends CharacterData {
106106

107107
boolean isJavaIdentifierStart(int ch) {
108108
// isJavaIdentifierStart strictly conforms to code points assigned
109-
// in Unicode 6.2. Since code points {32FF} and {20BB..20BF} are not
110-
// from Unicode 6.2, return false.
111-
if(ch == 0x32FF || (ch>= 0x20BB && ch<= 0x20BF))
109+
// in Unicode 6.2. Since code points {32FF}, {20BB..20BF}, and
110+
// {9FCD..9FEF} are not from Unicode 6.2, return false.
111+
if(ch == 0x32FF ||
112+
(ch>= 0x20BB && ch<= 0x20BF) ||
113+
(ch>= 0x9FCD && ch<= 0x9FEF))
112114
return false;
113115
int props = getProperties(ch);
114116
return ((props & $$maskIdentifierInfo) >= $$lowJavaStart);
115117
}
116118

117119
boolean isJavaIdentifierPart(int ch) {
118120
// isJavaIdentifierPart strictly conforms to code points assigned
119-
// in Unicode 6.2. Since code points {32FF} and {20BB..20BF} are not
120-
// from Unicode 6.2, return false.
121-
if(ch == 0x32FF || (ch>= 0x20BB && ch<= 0x20BF))
121+
// in Unicode 6.2. Since code points {32FF}, {20BB..20BF}, and
122+
// {9FCD..9FEF} are not from Unicode 6.2, return false.
123+
if(ch == 0x32FF ||
124+
(ch>= 0x20BB && ch<= 0x20BF) ||
125+
(ch>= 0x9FCD && ch<= 0x9FEF))
122126
return false;
123127
int props = getProperties(ch);
124128
return ((props & $$nonzeroJavaPart) != 0);

jdk/make/data/unicodedata/UnicodeData.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11732,7 +11732,7 @@
1173211732
4DFE;HEXAGRAM FOR AFTER COMPLETION;So;0;ON;;;;;N;;;;;
1173311733
4DFF;HEXAGRAM FOR BEFORE COMPLETION;So;0;ON;;;;;N;;;;;
1173411734
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
11735-
9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11735+
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
1173611736
A000;YI SYLLABLE IT;Lo;0;L;;;;;N;;;;;
1173711737
A001;YI SYLLABLE IX;Lo;0;L;;;;;N;;;;;
1173811738
A002;YI SYLLABLE I;Lo;0;L;;;;;N;;;;;

jdk/src/share/classes/java/lang/Character.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -52,13 +52,17 @@
5252
* <a href="http://www.unicode.org">http://www.unicode.org</a>.
5353
* <p>
5454
* The Java SE 8 Platform uses character information from version 6.2
55-
* of the Unicode Standard, with two extensions. First, the Java SE 8 Platform
56-
* allows an implementation of class {@code Character} to use the Japanese Era
57-
* code point, {@code U+32FF}, from the first version of the Unicode Standard
58-
* after 6.2 that assigns the code point. Second, in recognition of the fact
55+
* of the Unicode Standard, with three extensions. First, in recognition of the fact
5956
* that new currencies appear frequently, the Java SE 8 Platform allows an
6057
* implementation of class {@code Character} to use the Currency Symbols
61-
* block from version 10.0 of the Unicode Standard. Consequently, the
58+
* block from version 10.0 of the Unicode Standard. Second, the Java SE 8 Platform
59+
* allows an implementation of class {@code Character} to use the code points
60+
* in the range of {@code U+9FCD} to {@code U+9FEF} from version 11.0 of the
61+
* Unicode Standard, in order for the class to allow the "Implementation
62+
* Level 1" of the Chinese GB18030-2022 standard. Third, the Java SE 8 Platform
63+
* allows an implementation of class {@code Character} to use the Japanese Era
64+
* code point, {@code U+32FF}, from the Unicode Standard version 12.1.
65+
* Consequently, the
6266
* behavior of fields and methods of class {@code Character} may vary across
6367
* implementations of the Java SE 8 Platform when processing the aforementioned
6468
* code points ( outside of version 6.2 ), except for the following methods

jdk/test/java/lang/Character/Scripts.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,15 +1433,15 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
14331433
3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
14341434
303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
14351435
3400..4DB5 ; Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
1436-
4E00..9FCC ; Han # Lo [20941] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FCC
1436+
4E00..9FEF ; Han # Lo [20976] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEF
14371437
F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
14381438
FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
14391439
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
14401440
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
14411441
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
14421442
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
14431443

1444-
# Total code points: 75963
1444+
# Total code points: 75998
14451445

14461446
# ================================================
14471447

jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java

Lines changed: 52 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -28,22 +28,16 @@
2828
* @bug 8218915
2929
*/
3030

31-
import java.util.List;
32-
import java.util.ArrayList;
33-
3431
public class TestIsJavaIdentifierMethods {
35-
36-
// List of new code points are not present in Unicode 6.2.
37-
private static final List<Integer> UNASSIGNED_CODEPOINTS_IN_6_2
38-
= new ArrayList<Integer>()
39-
{{
40-
add(0x20BB); // NORDIC MARK SIGN
41-
add(0x20BC); // MANAT SIGN
42-
add(0x20BD); // RUBLE SIGN
43-
add(0x20BE); // LARI SIGN
44-
add(0x20BF); // BITCOIN SIGN
45-
add(0x32FF); // SQUARE ERA NAME NEWERA
46-
}};
32+
// Unassigned code points not present in Unicode 6.2 (which Java SE 8
33+
// is based upon), including: various currency symbol sign code points
34+
// (Nordic Mark ... Bitcoin), Japanese Era Square character code point,
35+
// and 35 CJK Unified Ideograph code points from GB18030-2022
36+
private static final int CS_SIGNS_CODEPOINT_START = 0x20BB;
37+
private static final int CS_SIGNS_CODEPOINT_END = 0x20BF;
38+
private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;
39+
private static final int GB18030_2022_CODEPOINT_START = 0x9FCD;
40+
private static final int GB18030_2022_CODEPOINT_END = 0x9FEF;
4741

4842
public static void main(String[] args) {
4943
testIsJavaIdentifierPart_int();
@@ -75,14 +69,15 @@ public static void main(String[] args) {
7569
public static void testIsJavaIdentifierPart_int() {
7670
for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
7771
boolean expected = false;
78-
7972
// Since Character.isJavaIdentifierPart(int) strictly conforms to
8073
// character information from version 6.2 of the Unicode Standard,
81-
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
82-
// list. If the code point is found in list
83-
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
84-
// "expected" is considered false.
85-
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(cp)) {
74+
// check if code point is one of the extra unassigned
75+
// code points (defined at the beginning of the file). If the code
76+
// point is found to be one of the unassigned code points,
77+
// value of variable "expected" is considered false.
78+
if (cp != JAPANESE_ERA_CODEPOINT &&
79+
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
80+
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
8681
byte type = (byte) Character.getType(cp);
8782
expected = Character.isLetter(cp)
8883
|| type == Character.CURRENCY_SYMBOL
@@ -124,11 +119,13 @@ public static void testIsJavaIdentifierPart_char() {
124119
boolean expected = false;
125120
// Since Character.isJavaIdentifierPart(char) strictly conforms to
126121
// character information from version 6.2 of the Unicode Standard,
127-
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
128-
// list. If the code point is found in list
129-
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
130-
// "expected" is considered false.
131-
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
122+
// check if code point is one of the extra unassigned
123+
// code points (defined at the beginning of the file). If the code
124+
// point is found to be one of the unassigned code points,
125+
// value of variable "expected" is considered false.
126+
if (i != JAPANESE_ERA_CODEPOINT &&
127+
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
128+
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
132129
byte type = (byte) Character.getType(ch);
133130
expected = Character.isLetter(ch)
134131
|| type == Character.CURRENCY_SYMBOL
@@ -165,11 +162,13 @@ public static void testIsJavaIdentifierStart_int() {
165162
boolean expected = false;
166163
// Since Character.isJavaIdentifierStart(int) strictly conforms to
167164
// character information from version 6.2 of the Unicode Standard,
168-
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
169-
// list. If the code point is found in list
170-
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
171-
// "expected" is considered false.
172-
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(cp)) {
165+
// check if code point is one of the extra unassigned
166+
// code points (defined at the beginning of the file). If the code
167+
// point is found to be one of the unassigned code points,
168+
// value of variable "expected" is considered false.
169+
if (cp != JAPANESE_ERA_CODEPOINT &&
170+
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
171+
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
173172
byte type = (byte) Character.getType(cp);
174173
expected = Character.isLetter(cp)
175174
|| type == Character.LETTER_NUMBER
@@ -203,11 +202,13 @@ public static void testIsJavaIdentifierStart_char() {
203202
boolean expected = false;
204203
// Since Character.isJavaIdentifierStart(char) strictly conforms to
205204
// character information from version 6.2 of the Unicode Standard,
206-
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
207-
// list. If the code point is found in list
208-
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
209-
// "expected" is considered false.
210-
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
205+
// check if code point is one of the extra unassigned
206+
// code points (defined at the beginning of the file). If the code
207+
// point is found to be one of the unassigned code points,
208+
// value of variable "expected" is considered false.
209+
if (i != JAPANESE_ERA_CODEPOINT &&
210+
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
211+
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
211212
byte type = (byte) Character.getType(ch);
212213
expected = Character.isLetter(ch)
213214
|| type == Character.LETTER_NUMBER
@@ -241,11 +242,13 @@ public static void testIsJavaLetter() {
241242
boolean expected = false;
242243
// Since Character.isJavaLetter(char) strictly conforms to
243244
// character information from version 6.2 of the Unicode Standard,
244-
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
245-
// list. If the code point is found in list
246-
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
247-
// "expected" is considered false.
248-
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
245+
// check if code point is one of the extra unassigned
246+
// code points (defined at the beginning of the file). If the code
247+
// point is found to be one of the unassigned code points,
248+
// value of variable "expected" is considered false.
249+
if (i != JAPANESE_ERA_CODEPOINT &&
250+
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
251+
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
249252
byte type = (byte) Character.getType(ch);
250253
expected = Character.isLetter(ch)
251254
|| type == Character.LETTER_NUMBER
@@ -283,11 +286,13 @@ public static void testIsJavaLetterOrDigit() {
283286
boolean expected = false;
284287
// Since Character.isJavaLetterOrDigit(char) strictly conforms to
285288
// character information from version 6.2 of the Unicode Standard,
286-
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
287-
// list. If the code point is found in list
288-
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
289-
// "expected" is considered false.
290-
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
289+
// check if code point is one of the extra unassigned
290+
// code points (defined at the beginning of the file). If the code
291+
// point is found to be one of the unassigned code points,
292+
// value of variable "expected" is considered false.
293+
if (i != JAPANESE_ERA_CODEPOINT &&
294+
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
295+
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
291296
byte type = (byte) Character.getType(ch);
292297
expected = Character.isLetter(ch)
293298
|| type == Character.CURRENCY_SYMBOL

0 commit comments

Comments
 (0)