Skip to content

Commit

Permalink
8301400: Allow additional characters for GB18030-2022 support
Browse files Browse the repository at this point in the history
Co-authored-by: Justin Lu <jlu@openjdk.org>
Reviewed-by: andrew, lancea, iris, alanb
  • Loading branch information
naotoj and Justin Lu committed Mar 2, 2023
1 parent 9176874 commit a039f39
Show file tree
Hide file tree
Showing 5 changed files with 367 additions and 11 deletions.
16 changes: 15 additions & 1 deletion jdk/make/data/characterdata/CharacterData00.java.template
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -105,11 +105,25 @@ class CharacterData00 extends CharacterData {
}

boolean isJavaIdentifierStart(int ch) {
    // Java identifier rules are pinned to the code points assigned in
    // Unicode 6.2. The code points below are not from Unicode 6.2, so
    // they must never be accepted as the start of an identifier:
    //   U+32FF          (Japanese era symbol)
    //   U+20BB..U+20BF  (currency signs)
    //   U+9FCD..U+9FEF  (CJK ideographs)
    final boolean eraSymbol = (ch == 0x32FF);
    final boolean lateCurrencySign = (0x20BB <= ch && ch <= 0x20BF);
    final boolean lateCjkIdeograph = (0x9FCD <= ch && ch <= 0x9FEF);
    if (eraSymbol || lateCurrencySign || lateCjkIdeograph) {
        return false;
    }

    // Every other code point is decided by its packed property bits.
    final int identifierBits = getProperties(ch) & $$maskIdentifierInfo;
    return identifierBits >= $$lowJavaStart;
}

boolean isJavaIdentifierPart(int ch) {
    // Java identifier rules are pinned to the code points assigned in
    // Unicode 6.2. The code points below are not from Unicode 6.2, so
    // they must never be accepted as part of an identifier:
    //   U+32FF          (Japanese era symbol)
    //   U+20BB..U+20BF  (currency signs)
    //   U+9FCD..U+9FEF  (CJK ideographs)
    final boolean eraSymbol = (ch == 0x32FF);
    final boolean lateCurrencySign = (0x20BB <= ch && ch <= 0x20BF);
    final boolean lateCjkIdeograph = (0x9FCD <= ch && ch <= 0x9FEF);
    if (eraSymbol || lateCurrencySign || lateCjkIdeograph) {
        return false;
    }

    // Every other code point is decided by its packed property bits.
    final int javaPartBits = getProperties(ch) & $$nonzeroJavaPart;
    return javaPartBits != 0;
}
Expand Down
8 changes: 7 additions & 1 deletion jdk/make/data/unicodedata/UnicodeData.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7191,6 +7191,11 @@
20B8;TENGE SIGN;Sc;0;ET;;;;;N;;;;;
20B9;INDIAN RUPEE SIGN;Sc;0;ET;;;;;N;;;;;
20BA;TURKISH LIRA SIGN;Sc;0;ET;;;;;N;;;;;
20BB;NORDIC MARK SIGN;Sc;0;ET;;;;;N;;;;;
20BC;MANAT SIGN;Sc;0;ET;;;;;N;;;;;
20BD;RUBLE SIGN;Sc;0;ET;;;;;N;;;;;
20BE;LARI SIGN;Sc;0;ET;;;;;N;;;;;
20BF;BITCOIN SIGN;Sc;0;ET;;;;;N;;;;;
20D0;COMBINING LEFT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING LEFT HARPOON ABOVE;;;;
20D1;COMBINING RIGHT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING RIGHT HARPOON ABOVE;;;;
20D2;COMBINING LONG VERTICAL LINE OVERLAY;Mn;1;NSM;;;;;N;NON-SPACING LONG VERTICAL BAR OVERLAY;;;;
Expand Down Expand Up @@ -11403,6 +11408,7 @@
32FC;CIRCLED KATAKANA WI;So;0;L;<circle> 30F0;;;;N;;;;;
32FD;CIRCLED KATAKANA WE;So;0;L;<circle> 30F1;;;;N;;;;;
32FE;CIRCLED KATAKANA WO;So;0;L;<circle> 30F2;;;;N;;;;;
32FF;SQUARE ERA NAME NEWERA;So;0;L;<square> 5143 53F7;;;;N;SQUARED TWO IDEOGRAPHS ERA NAME NEWERA;;;;
3300;SQUARE APAATO;So;0;L;<square> 30A2 30D1 30FC 30C8;;;;N;SQUARED APAATO;;;;
3301;SQUARE ARUHUA;So;0;L;<square> 30A2 30EB 30D5 30A1;;;;N;SQUARED ARUHUA;;;;
3302;SQUARE ANPEA;So;0;L;<square> 30A2 30F3 30DA 30A2;;;;N;SQUARED ANPEA;;;;
Expand Down Expand Up @@ -11726,7 +11732,7 @@
4DFE;HEXAGRAM FOR AFTER COMPLETION;So;0;ON;;;;;N;;;;;
4DFF;HEXAGRAM FOR BEFORE COMPLETION;So;0;ON;;;;;N;;;;;
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
A000;YI SYLLABLE IT;Lo;0;L;;;;;N;;;;;
A001;YI SYLLABLE IX;Lo;0;L;;;;;N;;;;;
A002;YI SYLLABLE I;Lo;0;L;;;;;N;;;;;
Expand Down
34 changes: 27 additions & 7 deletions jdk/src/share/classes/java/lang/Character.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -49,10 +49,26 @@
* and general category for every defined Unicode code point or
* character range.
* <p>
* The file and its description are available from the Unicode Consortium at:
* <ul>
* <li><a href="http://www.unicode.org">http://www.unicode.org</a>
* </ul>
* The Java SE 8 Platform uses character information from version 6.2
* of the Unicode Standard, with three extensions. First, in recognition of the fact
* that new currencies appear frequently, the Java SE 8 Platform allows an
* implementation of class {@code Character} to use the Currency Symbols
* block from version 10.0 of the Unicode Standard. Second, the Java SE 8 Platform
* allows an implementation of class {@code Character} to use the code points
* in the range of {@code U+9FCD} to {@code U+9FEF} from version 11.0 of the
* Unicode Standard, in order for the class to support "Implementation
* Level 1" of the Chinese GB18030-2022 standard. Third, the Java SE 8 Platform
* allows an implementation of class {@code Character} to use the Japanese Era
* code point, {@code U+32FF}, from the Unicode Standard version 12.1.
* Consequently, the
* behavior of fields and methods of class {@code Character} may vary across
* implementations of the Java SE 8 Platform when processing the aforementioned
* code points (outside of version 6.2), except for the following methods
* that define Java identifiers:
* {@link #isJavaIdentifierStart(int)}, {@link #isJavaIdentifierStart(char)},
* {@link #isJavaIdentifierPart(int)}, and {@link #isJavaIdentifierPart(char)}.
* Code points in Java identifiers must be drawn from version 6.2 of
* the Unicode Standard.
*
* <h3><a name="unicode">Unicode Character Representations</a></h3>
*
Expand Down Expand Up @@ -3914,7 +3930,9 @@ public static enum UnicodeScript {
0x3220, // 3220..325F; COMMON
0x3260, // 3260..327E; HANGUL
0x327F, // 327F..32CF; COMMON
0x32D0, // 32D0..3357; KATAKANA
0x32D0, // 32D0..32FE; KATAKANA
0x32FF, // 32FF ; COMMON
0x3300, // 3300..3357; KATAKANA
0x3358, // 3358..33FF; COMMON
0x3400, // 3400..4DBF; HAN
0x4DC0, // 4DC0..4DFF; COMMON
Expand Down Expand Up @@ -4235,7 +4253,9 @@ public static enum UnicodeScript {
COMMON,
HANGUL,
COMMON,
KATAKANA,
KATAKANA, // 32D0..32FE
COMMON, // 32FF
KATAKANA, // 3300..3357
COMMON,
HAN,
COMMON,
Expand Down
6 changes: 4 additions & 2 deletions jdk/test/java/lang/Character/Scripts.txt
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS
208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS
20A0..20BA ; Common # Sc [27] EURO-CURRENCY SIGN..TURKISH LIRA SIGN
20BB..20BF ; Common # Sc [5] NORDIC MARK SIGN..BITCOIN SIGN
2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
2102 ; Common # L& DOUBLE-STRUCK CAPITAL C
2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA
Expand Down Expand Up @@ -381,6 +382,7 @@
328A..32B0 ; Common # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
32B1..32BF ; Common # No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
32C0..32CF ; Common # So [16] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..LIMITED LIABILITY SIGN
32FF ; Common # So SQUARE ERA NAME NEWERA
3358..33FF ; Common # So [168] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..SQUARE GAL
4DC0..4DFF ; Common # So [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION
A700..A716 ; Common # Sk [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR
Expand Down Expand Up @@ -1431,15 +1433,15 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
3400..4DB5 ; Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FCC ; Han # Lo [20941] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FCC
4E00..9FEF ; Han # Lo [20976] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEF
F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D

# Total code points: 75963
# Total code points: 75998

# ================================================

Expand Down
Loading

1 comment on commit a039f39

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.