Skip to content

Commit

Permalink
8301400: Allow additional characters for GB18030-2022 support
Browse files Browse the repository at this point in the history
Co-authored-by: Justin Lu <jlu@openjdk.org>
Reviewed-by: lancea, iris, rriggs
Backport-of: a039f39ee8486e990a4b4341c9e07fc49d3dac2d
  • Loading branch information
naotoj and Justin Lu committed May 10, 2023
1 parent 3147b1b commit 9740c24
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 63 deletions.
18 changes: 11 additions & 7 deletions jdk/make/data/characterdata/CharacterData00.java.template
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -106,19 +106,23 @@ class CharacterData00 extends CharacterData {

boolean isJavaIdentifierStart(int ch) {
// isJavaIdentifierStart strictly conforms to code points assigned
// in Unicode 6.2. Since code points {32FF} and {20BB..20BF} are not
// from Unicode 6.2, return false.
if(ch == 0x32FF || (ch>= 0x20BB && ch<= 0x20BF))
// in Unicode 6.2. Since code points {32FF}, {20BB..20BF}, and
// {9FCD..9FEF} are not from Unicode 6.2, return false.
if(ch == 0x32FF ||
(ch>= 0x20BB && ch<= 0x20BF) ||
(ch>= 0x9FCD && ch<= 0x9FEF))
return false;
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) >= $$lowJavaStart);
}

boolean isJavaIdentifierPart(int ch) {
// isJavaIdentifierPart strictly conforms to code points assigned
// in Unicode 6.2. Since code points {32FF} and {20BB..20BF} are not
// from Unicode 6.2, return false.
if(ch == 0x32FF || (ch>= 0x20BB && ch<= 0x20BF))
// in Unicode 6.2. Since code points {32FF}, {20BB..20BF}, and
// {9FCD..9FEF} are not from Unicode 6.2, return false.
if(ch == 0x32FF ||
(ch>= 0x20BB && ch<= 0x20BF) ||
(ch>= 0x9FCD && ch<= 0x9FEF))
return false;
int props = getProperties(ch);
return ((props & $$nonzeroJavaPart) != 0);
Expand Down
2 changes: 1 addition & 1 deletion jdk/make/data/unicodedata/UnicodeData.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11732,7 +11732,7 @@
4DFE;HEXAGRAM FOR AFTER COMPLETION;So;0;ON;;;;;N;;;;;
4DFF;HEXAGRAM FOR BEFORE COMPLETION;So;0;ON;;;;;N;;;;;
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
A000;YI SYLLABLE IT;Lo;0;L;;;;;N;;;;;
A001;YI SYLLABLE IX;Lo;0;L;;;;;N;;;;;
A002;YI SYLLABLE I;Lo;0;L;;;;;N;;;;;
Expand Down
16 changes: 10 additions & 6 deletions jdk/src/share/classes/java/lang/Character.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -52,13 +52,17 @@
* <a href="http://www.unicode.org">http://www.unicode.org</a>.
* <p>
* The Java SE 8 Platform uses character information from version 6.2
* of the Unicode Standard, with two extensions. First, the Java SE 8 Platform
* allows an implementation of class {@code Character} to use the Japanese Era
* code point, {@code U+32FF}, from the first version of the Unicode Standard
* after 6.2 that assigns the code point. Second, in recognition of the fact
* of the Unicode Standard, with three extensions. First, in recognition of the fact
* that new currencies appear frequently, the Java SE 8 Platform allows an
* implementation of class {@code Character} to use the Currency Symbols
* block from version 10.0 of the Unicode Standard. Consequently, the
* block from version 10.0 of the Unicode Standard. Second, the Java SE 8 Platform
* allows an implementation of class {@code Character} to use the code points
* in the range of {@code U+9FCD} to {@code U+9FEF} from version 11.0 of the
* Unicode Standard, in order for the class to allow the "Implementation
* Level 1" of the Chinese GB18030-2022 standard. Third, the Java SE 8 Platform
* allows an implementation of class {@code Character} to use the Japanese Era
* code point, {@code U+32FF}, from the Unicode Standard version 12.1.
* Consequently, the
* behavior of fields and methods of class {@code Character} may vary across
* implementations of the Java SE 8 Platform when processing the aforementioned
* code points (outside of version 6.2), except for the following methods
Expand Down
4 changes: 2 additions & 2 deletions jdk/test/java/lang/Character/Scripts.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1433,15 +1433,15 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
3400..4DB5 ; Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FCC ; Han # Lo [20941] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FCC
4E00..9FEF ; Han # Lo [20976] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEF
F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D

# Total code points: 75963
# Total code points: 75998

# ================================================

Expand Down
99 changes: 52 additions & 47 deletions jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -28,22 +28,16 @@
* @bug 8218915
*/

import java.util.List;
import java.util.ArrayList;

public class TestIsJavaIdentifierMethods {

// List of new code points are not present in Unicode 6.2.
private static final List<Integer> UNASSIGNED_CODEPOINTS_IN_6_2
= new ArrayList<Integer>()
{{
add(0x20BB); // NORDIC MARK SIGN
add(0x20BC); // MANAT SIGN
add(0x20BD); // RUBLE SIGN
add(0x20BE); // LARI SIGN
add(0x20BF); // BITCOIN SIGN
add(0x32FF); // SQUARE ERA NAME NEWERA
}};
// Code points that are unassigned in Unicode 6.2 (which Java SE 8 is
// based upon) but are allowed as extensions: the currency sign code
// points (Nordic Mark ... Bitcoin), the Japanese Era square character,
// and the 35 CJK Unified Ideograph code points required by GB18030-2022.
private static final int CS_SIGNS_CODEPOINT_START = 0x20BB;
private static final int CS_SIGNS_CODEPOINT_END = 0x20BF;
private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;
private static final int GB18030_2022_CODEPOINT_START = 0x9FCD;
private static final int GB18030_2022_CODEPOINT_END = 0x9FEF;

public static void main(String[] args) {
testIsJavaIdentifierPart_int();
Expand Down Expand Up @@ -75,14 +69,15 @@ public static void main(String[] args) {
public static void testIsJavaIdentifierPart_int() {
for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
boolean expected = false;

// Since Character.isJavaIdentifierPart(int) strictly conforms to
// character information from version 6.2 of the Unicode Standard,
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
// list. If the code point is found in list
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
// "expected" is considered false.
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(cp)) {
// check if code point is one of the extra unassigned
// code points (defined at the beginning of the file). If the code
// point is found to be one of the unassigned code points,
// value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT &&
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(cp);
expected = Character.isLetter(cp)
|| type == Character.CURRENCY_SYMBOL
Expand Down Expand Up @@ -124,11 +119,13 @@ public static void testIsJavaIdentifierPart_char() {
boolean expected = false;
// Since Character.isJavaIdentifierPart(char) strictly conforms to
// character information from version 6.2 of the Unicode Standard,
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
// list. If the code point is found in list
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
// "expected" is considered false.
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
// check if code point is one of the extra unassigned
// code points (defined at the beginning of the file). If the code
// point is found to be one of the unassigned code points,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.CURRENCY_SYMBOL
Expand Down Expand Up @@ -165,11 +162,13 @@ public static void testIsJavaIdentifierStart_int() {
boolean expected = false;
// Since Character.isJavaIdentifierStart(int) strictly conforms to
// character information from version 6.2 of the Unicode Standard,
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
// list. If the code point is found in list
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
// "expected" is considered false.
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(cp)) {
// check if code point is one of the extra unassigned
// code points (defined at the beginning of the file). If the code
// point is found to be one of the unassigned code points,
// value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT &&
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(cp);
expected = Character.isLetter(cp)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -203,11 +202,13 @@ public static void testIsJavaIdentifierStart_char() {
boolean expected = false;
// Since Character.isJavaIdentifierStart(char) strictly conforms to
// character information from version 6.2 of the Unicode Standard,
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
// list. If the code point is found in list
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
// "expected" is considered false.
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
// check if code point is one of the extra unassigned
// code points (defined at the beginning of the file). If the code
// point is found to be one of the unassigned code points,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -241,11 +242,13 @@ public static void testIsJavaLetter() {
boolean expected = false;
// Since Character.isJavaLetter(char) strictly conforms to
// character information from version 6.2 of the Unicode Standard,
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
// list. If the code point is found in list
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
// "expected" is considered false.
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
// check if code point is one of the extra unassigned
// code points (defined at the beginning of the file). If the code
// point is found to be one of the unassigned code points,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -283,11 +286,13 @@ public static void testIsJavaLetterOrDigit() {
boolean expected = false;
// Since Character.isJavaLetterOrDigit(char) strictly conforms to
// character information from version 6.2 of the Unicode Standard,
// check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2"
// list. If the code point is found in list
// "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable
// "expected" is considered false.
if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) {
// check if code point is one of the extra unassigned
// code points (defined at the beginning of the file). If the code
// point is found to be one of the unassigned code points,
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.CURRENCY_SYMBOL
Expand Down

1 comment on commit 9740c24

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.