Skip to content

Commit 3d9e8d0

Browse files
naotoj and Justin Lu
committed
8305681: Allow additional characters for GB18030-2022 (Level 2) support
Co-authored-by: Justin Lu <jlu@openjdk.org>
Reviewed-by: lancea, iris, rriggs
Backport-of: b08cddec8625424b1292051088513a60606ef1e9
1 parent 9740c24 commit 3d9e8d0

File tree

6 files changed

+55
-18
lines changed

6 files changed

+55
-18
lines changed

jdk/make/data/characterdata/CharacterData02.java.template

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,21 @@ class CharacterData02 extends CharacterData {
103103
}
104104

105105
boolean isJavaIdentifierStart(int ch) {
106+
// isJavaIdentifierStart strictly conforms to code points assigned
107+
// in Unicode 6.2.
108+
if(Character.UnicodeBlock.of(ch) ==
109+
Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E)
110+
return false;
106111
int props = getProperties(ch);
107112
return ((props & $$maskIdentifierInfo) >= $$lowJavaStart);
108113
}
109114

110115
boolean isJavaIdentifierPart(int ch) {
116+
// isJavaIdentifierPart strictly conforms to code points assigned
117+
// in Unicode 6.2.
118+
if(Character.UnicodeBlock.of(ch) ==
119+
Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E)
120+
return false;
111121
int props = getProperties(ch);
112122
return ((props & $$nonzeroJavaPart) != 0);
113123
}

jdk/make/data/unicodedata/UnicodeData.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23550,6 +23550,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
2355023550
2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
2355123551
2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
2355223552
2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
23553+
2B820;<CJK Ideograph Extension E, First>;Lo;0;L;;;;;N;;;;;
23554+
2CEA1;<CJK Ideograph Extension E, Last>;Lo;0;L;;;;;N;;;;;
2355323555
2F800;CJK COMPATIBILITY IDEOGRAPH-2F800;Lo;0;L;4E3D;;;;N;;;;;
2355423556
2F801;CJK COMPATIBILITY IDEOGRAPH-2F801;Lo;0;L;4E38;;;;N;;;;;
2355523557
2F802;CJK COMPATIBILITY IDEOGRAPH-2F802;Lo;0;L;4E41;;;;N;;;;;

jdk/src/share/classes/java/lang/Character.java

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,10 @@
5858
* block from version 10.0 of the Unicode Standard. Second, the Java SE 8 Platform
5959
* allows an implementation of class {@code Character} to use the code points
6060
* in the range of {@code U+9FCD} to {@code U+9FEF} from version 11.0 of the
61-
* Unicode Standard, in order for the class to allow the "Implementation
62-
* Level 1" of the Chinese GB18030-2022 standard. Third, the Java SE 8 Platform
61+
* Unicode Standard and in the {@code CJK Unified Ideographs Extension E} block
62+
* from version 8.0 of the Unicode Standard, in order for the class to allow the
63+
* "Implementation Level 2" of the Chinese GB18030-2022 standard.
64+
* Third, the Java SE 8 Platform
6365
* allows an implementation of class {@code Character} to use the Japanese Era
6466
* code point, {@code U+32FF}, from the Unicode Standard version 12.1.
6567
* Consequently, the
@@ -2575,7 +2577,18 @@ private UnicodeBlock(String idName, String... aliases) {
25752577
"ARABIC MATHEMATICAL ALPHABETIC SYMBOLS",
25762578
"ARABICMATHEMATICALALPHABETICSYMBOLS");
25772579

2578-
private static final int[] blockStarts = {
2580+
/**
2581+
* Constant for the "CJK Unified Ideographs Extension E" Unicode
2582+
* character block.
2583+
* @apiNote This field is defined in Java SE 8 Maintenance Release 5.
2584+
* @since 1.8
2585+
*/
2586+
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E =
2587+
new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E",
2588+
"CJK UNIFIED IDEOGRAPHS EXTENSION E",
2589+
"CJKUNIFIEDIDEOGRAPHSEXTENSIONE");
2590+
2591+
private static final int blockStarts[] = {
25792592
0x0000, // 0000..007F; Basic Latin
25802593
0x0080, // 0080..00FF; Latin-1 Supplement
25812594
0x0100, // 0100..017F; Latin Extended-A
@@ -2823,7 +2836,8 @@ private UnicodeBlock(String idName, String... aliases) {
28232836
0x2A6E0, // unassigned
28242837
0x2A700, // 2A700..2B73F; CJK Unified Ideographs Extension C
28252838
0x2B740, // 2B740..2B81F; CJK Unified Ideographs Extension D
2826-
0x2B820, // unassigned
2839+
0x2B820, // 2B820..2CEAF; CJK Unified Ideographs Extension E
2840+
0x2CEB0, // unassigned
28272841
0x2F800, // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
28282842
0x2FA20, // unassigned
28292843
0xE0000, // E0000..E007F; Tags
@@ -3082,6 +3096,7 @@ private UnicodeBlock(String idName, String... aliases) {
30823096
null,
30833097
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
30843098
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
3099+
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
30853100
null,
30863101
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
30873102
null,

jdk/test/java/lang/Character/CheckScript.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
/*
3-
* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
3+
* Copyright (c) 2010, 2023, Oracle and/or its affiliates. All rights reserved.
44
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
*
66
* This code is free software; you can redistribute it and/or modify it
@@ -24,7 +24,7 @@
2424

2525
/**
2626
* @test
27-
* @bug 6945564 6959267 7033561 7070436 7198195
27+
* @bug 6945564 6959267 7033561 7070436 7198195 8305681
2828
* @summary Check that the j.l.Character.UnicodeScript
2929
*/
3030

jdk/test/java/lang/Character/Scripts.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1439,9 +1439,10 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI
14391439
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
14401440
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
14411441
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
1442+
2B820..2CEA1 ; Han # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
14421443
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
14431444

1444-
# Total code points: 75998
1445+
# Total code points: 81760
14451446

14461447
# ================================================
14471448

jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,26 @@
2525
* @test
2626
* @summary Test behavior of isJavaIdentifierXX, testIsJavaLetter, and
2727
* testIsJavaLetterOrDigit methods for all code points.
28-
* @bug 8218915
28+
* @bug 8218915 8301400 8305681
2929
*/
3030

3131
public class TestIsJavaIdentifierMethods {
3232
// Unassigned code points not present in Unicode 6.2 (which Java SE 8
3333
// is based upon), including: various currency symbol sign code points
34-
// (Nordic Mark ... Bitcoin), Japanese Era Square character code point,
35-
// and 35 CJK Unified Ideograph code points from GB18030-2022
34+
// (Nordic Mark ... Bitcoin), the Japanese Era Square character code point, and
35+
// code points for GB18030-2022 level 1 and 2 implementation including
36+
// (35 code points from CJK Unified Ideographs and all of CJK Unified Ideographs
37+
// Extension E).
3638
private static final int CS_SIGNS_CODEPOINT_START = 0x20BB;
3739
private static final int CS_SIGNS_CODEPOINT_END = 0x20BF;
3840
private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;
39-
private static final int GB18030_2022_CODEPOINT_START = 0x9FCD;
40-
private static final int GB18030_2022_CODEPOINT_END = 0x9FEF;
41+
// GB18030_2022 Code Points
42+
private static final int CJK_GB18030_LEVEL1_START = 0x9FCD;
43+
private static final int CJK_GB18030_LEVEL1_END = 0x9FEF;
44+
// Extension E code points are greater than U+FFFF,
45+
// and thus only the int methods need to be tested
46+
private static final int CJK_EXTENSION_E_START = 0x2B820;
47+
private static final int CJK_EXTENSION_E_END = 0x2CEAF;
4148

4249
public static void main(String[] args) {
4350
testIsJavaIdentifierPart_int();
@@ -77,7 +84,8 @@ public static void testIsJavaIdentifierPart_int() {
7784
// value of variable "expected" is considered false.
7885
if (cp != JAPANESE_ERA_CODEPOINT &&
7986
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
80-
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
87+
!(cp >= CJK_GB18030_LEVEL1_START && cp <= CJK_GB18030_LEVEL1_END) &&
88+
!(cp >= CJK_EXTENSION_E_START && cp <= CJK_EXTENSION_E_END)) {
8189
byte type = (byte) Character.getType(cp);
8290
expected = Character.isLetter(cp)
8391
|| type == Character.CURRENCY_SYMBOL
@@ -125,7 +133,7 @@ public static void testIsJavaIdentifierPart_char() {
125133
// value of variable "expected" is considered false.
126134
if (i != JAPANESE_ERA_CODEPOINT &&
127135
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
128-
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
136+
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
129137
byte type = (byte) Character.getType(ch);
130138
expected = Character.isLetter(ch)
131139
|| type == Character.CURRENCY_SYMBOL
@@ -168,7 +176,8 @@ public static void testIsJavaIdentifierStart_int() {
168176
// value of variable "expected" is considered false.
169177
if (cp != JAPANESE_ERA_CODEPOINT &&
170178
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
171-
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
179+
!(cp >= CJK_GB18030_LEVEL1_START && cp <= CJK_GB18030_LEVEL1_END) &&
180+
!(cp >= CJK_EXTENSION_E_START && cp <= CJK_EXTENSION_E_END)) {
172181
byte type = (byte) Character.getType(cp);
173182
expected = Character.isLetter(cp)
174183
|| type == Character.LETTER_NUMBER
@@ -208,7 +217,7 @@ public static void testIsJavaIdentifierStart_char() {
208217
// value of variable "expected" is considered false.
209218
if (i != JAPANESE_ERA_CODEPOINT &&
210219
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
211-
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
220+
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
212221
byte type = (byte) Character.getType(ch);
213222
expected = Character.isLetter(ch)
214223
|| type == Character.LETTER_NUMBER
@@ -248,7 +257,7 @@ public static void testIsJavaLetter() {
248257
// value of variable "expected" is considered false.
249258
if (i != JAPANESE_ERA_CODEPOINT &&
250259
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
251-
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
260+
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
252261
byte type = (byte) Character.getType(ch);
253262
expected = Character.isLetter(ch)
254263
|| type == Character.LETTER_NUMBER
@@ -292,7 +301,7 @@ public static void testIsJavaLetterOrDigit() {
292301
// value of variable "expected" is considered false.
293302
if (i != JAPANESE_ERA_CODEPOINT &&
294303
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
295-
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
304+
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
296305
byte type = (byte) Character.getType(ch);
297306
expected = Character.isLetter(ch)
298307
|| type == Character.CURRENCY_SYMBOL

0 commit comments

Comments
 (0)