Skip to content

Commit

Permalink
8301401: Allow additional characters for GB18030-2022 support
Browse files Browse the repository at this point in the history
Co-authored-by: Justin Lu <jlu@openjdk.org>
Reviewed-by: lancea, iris, rriggs
Backport-of: 0a700c6c3d150ed375c113b31b8e6185cbe57ae6
  • Loading branch information
naotoj and Justin Lu committed May 10, 2023
1 parent 63cb8a1 commit 75c5063
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 48 deletions.
14 changes: 7 additions & 7 deletions make/data/characterdata/CharacterData00.java.template
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -106,19 +106,19 @@ class CharacterData00 extends CharacterData {

// Reports whether the given code point may begin a Java identifier.
// Code points rejected by the explicit checks below yield false; every
// other code point is judged from its property word.
boolean isJavaIdentifierStart(int ch) {
// isJavaIdentifierStart strictly conforms to code points assigned
// in Unicode 10.0. Since code point {32FF} is not from Unicode 10.0,
// return false.
if(ch == 0x32FF)
// in Unicode 10.0. Since code points {32FF} and {9FEB..9FEF} are
// not from Unicode 10.0, return false.
if(ch == 0x32FF || (ch >= 0x9FEB && ch <= 0x9FEF))
return false;
// Look up the property word for ch and compare its identifier-info
// bits against the template's Java-start threshold.
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) >= $$lowJavaStart);
}

boolean isJavaIdentifierPart(int ch) {
// isJavaIdentifierPart strictly conforms to code points assigned
// in Unicode 10.0. Since code point {32FF} is not from Unicode 10.0,
// return false.
if(ch == 0x32FF)
// in Unicode 10.0. Since code points {32FF} and {9FEB..9FEF} are
// not from Unicode 10.0, return false.
if(ch == 0x32FF || (ch >= 0x9FEB && ch <= 0x9FEF))
return false;
int props = getProperties(ch);
return ((props & $$nonzeroJavaPart) != 0);
Expand Down
2 changes: 1 addition & 1 deletion make/data/unicodedata/UnicodeData.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12053,7 +12053,7 @@
4DFE;HEXAGRAM FOR AFTER COMPLETION;So;0;ON;;;;;N;;;;;
4DFF;HEXAGRAM FOR BEFORE COMPLETION;So;0;ON;;;;;N;;;;;
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FEA;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
A000;YI SYLLABLE IT;Lo;0;L;;;;;N;;;;;
A001;YI SYLLABLE IX;Lo;0;L;;;;;N;;;;;
A002;YI SYLLABLE I;Lo;0;L;;;;;N;;;;;
Expand Down
22 changes: 13 additions & 9 deletions src/java.base/share/classes/java/lang/Character.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -54,10 +54,14 @@
* <a href="http://www.unicode.org">http://www.unicode.org</a>.
* <p>
* The Java SE 11 Platform uses character information from version 10.0
* of the Unicode Standard, with an extension. The Java SE 11 Platform allows
* an implementation of class {@code Character} to use the Japanese Era
* code point, {@code U+32FF}, from the first version of the Unicode Standard
* after 10.0 that assigns the code point. Consequently, the behavior of
* of the Unicode Standard, with two extensions. First, the Java SE 11 Platform
* allows an implementation of class {@code Character} to use the code points
* in the range of {@code U+9FEB} to {@code U+9FEF} from the Unicode Standard
* version 11.0, in order for the class to allow the "Implementation Level 1"
* of the Chinese GB18030-2022 standard. Second, the Java SE 11 Platform
* allows an implementation of class {@code Character} to use the Japanese Era
* code point, {@code U+32FF}, from the Unicode Standard version 12.1.
* Consequently, the behavior of
* fields and methods of class {@code Character} may vary across
* implementations of the Java SE 11 Platform when processing the
* aforementioned code points (outside of version 10.0), except for
Expand Down Expand Up @@ -5402,8 +5406,8 @@ public static enum UnicodeScript {
0x3400, // 3400..4DB5; HAN
0x4DB6, // 4DB6..4DBF; UNKNOWN
0x4DC0, // 4DC0..4DFF; COMMON
0x4E00, // 4E00..9FEA; HAN
0x9FEB, // 9FEB..9FFF; UNKNOWN
0x4E00, // 4E00..9FEF; HAN
0x9FF0, // 9FF0..9FFF; UNKNOWN
0xA000, // A000..A48C; YI
0xA48D, // A48D..A48F; UNKNOWN
0xA490, // A490..A4C6; YI
Expand Down Expand Up @@ -6919,8 +6923,8 @@ public static enum UnicodeScript {
HAN, // 3400..4DB5
UNKNOWN, // 4DB6..4DBF
COMMON, // 4DC0..4DFF
HAN, // 4E00..9FEA
UNKNOWN, // 9FEB..9FFF
HAN, // 4E00..9FEF
UNKNOWN, // 9FF0..9FFF
YI, // A000..A48C
UNKNOWN, // A48D..A48F
YI, // A490..A4C6
Expand Down
33 changes: 30 additions & 3 deletions test/jdk/java/lang/Character/CharPropTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -40,6 +40,11 @@ public class CharPropTest {
private static int diffs = 0;
private static int rangeStart = 0x0000;
private static boolean isRange = false;
// Japanese Era Square character code point and GB18030-2022
// code points past Unicode 10.0 are not valid identifiers for Java SE 11
private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;
private static final int GB18030_2022_CODEPOINT_START = 0x9FEB;
private static final int GB18030_2022_CODEPOINT_END = 0x9FEF;

public static void main(String[] args) throws Exception {
Path path = Paths.get(System.getProperty("test.src", "."),
Expand Down Expand Up @@ -168,16 +173,38 @@ private static void isIdentifierIgnorableTest(int codePoint, String category) {
}

// Compares Character.isJavaIdentifierStart(codePoint) with the value
// expected for the given category and reports any mismatch via printDiff.
private static void isJavaIdentifierStartTest(int codePoint, String category) {
// Since Character.isJavaIdentifierStart(int) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
boolean expected = false;
if (codePoint != JAPANESE_ERA_CODEPOINT &&
!(codePoint >= GB18030_2022_CODEPOINT_START && codePoint <= GB18030_2022_CODEPOINT_END)) {
expected = isJavaIdentifierStart(category);
}
boolean actual = Character.isJavaIdentifierStart(codePoint);
boolean expected = isJavaIdentifierStart(category);
if (actual != expected) {
printDiff(codePoint, "isJavaIdentifierStart", actual, expected);
}
}

private static void isJavaIdentifierPartTest(int codePoint, String category) {
// Since Character.isJavaIdentifierPart(int) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
boolean expected = false;
if (codePoint != JAPANESE_ERA_CODEPOINT &&
!(codePoint >= GB18030_2022_CODEPOINT_START && codePoint <= GB18030_2022_CODEPOINT_END)) {
expected = isJavaIdentifierPart(codePoint, category);
}
boolean actual = Character.isJavaIdentifierPart(codePoint);
boolean expected = isJavaIdentifierPart(codePoint, category);
if (actual != expected) {
printDiff(codePoint, "isJavaIdentifierPart", actual, expected);
}
Expand Down
2 changes: 1 addition & 1 deletion test/jdk/java/lang/Character/Scripts.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1507,7 +1507,7 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
3400..4DB5 ; Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FEA ; Han # Lo [20971] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEA
4E00..9FEF ; Han # Lo [20976] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEF
F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
Expand Down
73 changes: 47 additions & 26 deletions test/jdk/java/lang/Character/TestIsJavaIdentifierMethods.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -30,8 +30,11 @@

public class TestIsJavaIdentifierMethods {

// Japanese Era Square character code point not present in Unicode 10.0
// Japanese Era Square character code point and GB18030-2022
// code points past Unicode 10.0 are not valid identifiers for Java SE 11
private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;
private static final int GB18030_2022_CODEPOINT_START = 0x9FEB;
private static final int GB18030_2022_CODEPOINT_END = 0x9FEF;

public static void main(String[] args) {
testIsJavaIdentifierPart_int();
Expand Down Expand Up @@ -65,10 +68,13 @@ public static void testIsJavaIdentifierPart_int() {
boolean expected = false;
// Since Character.isJavaIdentifierPart(int) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is "Japanese Era Square character code
// point". If the code point is "Japanese Era Square character code
// point", value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT) {
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT &&
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(cp);
expected = Character.isLetter(cp)
|| type == Character.CURRENCY_SYMBOL
Expand Down Expand Up @@ -110,10 +116,13 @@ public static void testIsJavaIdentifierPart_char() {
boolean expected = false;
// Since Character.isJavaIdentifierPart(char) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is "Japanese Era Square character code
// point". If the code point is "Japanese Era Square character code
// point", value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT) {
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.CURRENCY_SYMBOL
Expand Down Expand Up @@ -150,10 +159,13 @@ public static void testIsJavaIdentifierStart_int() {
boolean expected = false;
// Since Character.isJavaIdentifierStart(int) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is "Japanese Era Square character code
// point". If the code point is "Japanese Era Square character code
// point", value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT) {
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT &&
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(cp);
expected = Character.isLetter(cp)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -187,10 +199,13 @@ public static void testIsJavaIdentifierStart_char() {
boolean expected = false;
// Since Character.isJavaIdentifierStart(char) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is "Japanese Era Square character code
// point". If the code point is "Japanese Era Square character code
// point", value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT) {
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -224,10 +239,13 @@ public static void testIsJavaLetter() {
boolean expected = false;
// Since Character.isJavaLetter(char) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is "Japanese Era Square character code
// point". If the code point is "Japanese Era Square character code
// point", value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT) {
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -265,10 +283,13 @@ public static void testIsJavaLetterOrDigit() {
boolean expected = false;
// Since Character.isJavaLetterOrDigit(char) strictly conforms to
// character information from version 10.0 of the Unicode Standard,
// check if code point is "Japanese Era Square character code
// point". If the code point is "Japanese Era Square character code
// point", value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT) {
// check if code point is either a Japanese Era Square character
// code point or one of the GB18030-2022 code points > Unicode 10.0.
// If the code point is either a Japanese Era Square character code
// point or one of the GB18030-2022 code points > Unicode 10.0,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.CURRENCY_SYMBOL
Expand Down
2 changes: 1 addition & 1 deletion test/jdk/java/lang/Character/UnicodeData.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12053,7 +12053,7 @@
4DFE;HEXAGRAM FOR AFTER COMPLETION;So;0;ON;;;;;N;;;;;
4DFF;HEXAGRAM FOR BEFORE COMPLETION;So;0;ON;;;;;N;;;;;
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FEA;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
A000;YI SYLLABLE IT;Lo;0;L;;;;;N;;;;;
A001;YI SYLLABLE IX;Lo;0;L;;;;;N;;;;;
A002;YI SYLLABLE I;Lo;0;L;;;;;N;;;;;
Expand Down
Binary file modified test/jdk/java/lang/Character/charprop00.bin
Binary file not shown.

1 comment on commit 75c5063

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.