Skip to content

Commit 2aac0e9

Browse files
committed
8229831: Upgrade Character.isUnicodeIdentifierStart/Part() methods to the latest standard
Reviewed-by: rriggs
1 parent 4d70cda commit 2aac0e9

File tree

13 files changed

+12096
-75
lines changed

13 files changed

+12096
-75
lines changed

make/data/characterdata/CharacterData00.java.template

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,14 @@ class CharacterData00 extends CharacterData {
115115
}
116116

117117
boolean isUnicodeIdentifierStart(int ch) {
118-
int props = getProperties(ch);
119-
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
118+
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
119+
ch == 0x2E2F;
120120
}
121121

122122
boolean isUnicodeIdentifierPart(int ch) {
123-
int props = getProperties(ch);
124-
return ((props & $$maskUnicodePart) != 0);
123+
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
124+
isIdentifierIgnorable(ch) ||
125+
ch == 0x2E2F;
125126
}
126127

127128
boolean isIdentifierIgnorable(int ch) {

make/data/characterdata/CharacterData01.java.template

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,14 @@ class CharacterData01 extends CharacterData {
114114
}
115115

116116
boolean isUnicodeIdentifierStart(int ch) {
117-
int props = getProperties(ch);
118-
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
117+
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
118+
ch == 0x2E2F;
119119
}
120120

121121
boolean isUnicodeIdentifierPart(int ch) {
122-
int props = getProperties(ch);
123-
return ((props & $$maskUnicodePart) != 0);
122+
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
123+
isIdentifierIgnorable(ch) ||
124+
ch == 0x2E2F;
124125
}
125126

126127
boolean isIdentifierIgnorable(int ch) {

make/data/characterdata/CharacterData02.java.template

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -113,13 +113,14 @@ class CharacterData02 extends CharacterData {
113113
}
114114

115115
boolean isUnicodeIdentifierStart(int ch) {
116-
int props = getProperties(ch);
117-
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
116+
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
117+
ch == 0x2E2F;
118118
}
119119

120120
boolean isUnicodeIdentifierPart(int ch) {
121-
int props = getProperties(ch);
122-
return ((props & $$maskUnicodePart) != 0);
121+
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
122+
isIdentifierIgnorable(ch) ||
123+
ch == 0x2E2F;
123124
}
124125

125126
boolean isIdentifierIgnorable(int ch) {

make/data/characterdata/CharacterData0E.java.template

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -113,15 +113,16 @@ class CharacterData0E extends CharacterData {
113113
}
114114

115115
boolean isUnicodeIdentifierStart(int ch) {
116-
int props = getProperties(ch);
117-
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
116+
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
117+
ch == 0x2E2F;
118118
}
119119

120120
boolean isUnicodeIdentifierPart(int ch) {
121-
int props = getProperties(ch);
122-
return ((props & $$maskUnicodePart) != 0);
121+
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
122+
isIdentifierIgnorable(ch) ||
123+
ch == 0x2E2F;
123124
}
124-
125+
125126
boolean isIdentifierIgnorable(int ch) {
126127
int props = getProperties(ch);
127128
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);

make/data/characterdata/CharacterDataLatin1.java.template

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -133,13 +133,14 @@ class CharacterDataLatin1 extends CharacterData {
133133
}
134134

135135
boolean isUnicodeIdentifierStart(int ch) {
136-
int props = getProperties(ch);
137-
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
136+
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
137+
ch == 0x2E2F;
138138
}
139139

140140
boolean isUnicodeIdentifierPart(int ch) {
141-
int props = getProperties(ch);
142-
return ((props & $$maskUnicodePart) != 0);
141+
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
142+
isIdentifierIgnorable(ch) ||
143+
ch == 0x2E2F;
143144
}
144145

145146
boolean isIdentifierIgnorable(int ch) {

make/data/unicodedata/DerivedCoreProperties.txt

Lines changed: 11885 additions & 0 deletions
Large diffs are not rendered by default.

make/gensrc/GensrcCharacterData.gmk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ define SetupCharacterData
4242
-spec $(UNICODEDATA)/UnicodeData.txt \
4343
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
4444
-proplist $(UNICODEDATA)/PropList.txt \
45+
-derivedprops $(UNICODEDATA)/DerivedCoreProperties.txt \
4546
-o $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/lang/$1.java \
4647
-usecharforbyte $3
4748

make/jdk/src/classes/build/tools/generatecharacter/GenerateCharacter.java

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -73,6 +73,7 @@ public class GenerateCharacter {
7373
static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";
7474
static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
7575
static String DefaultPropListFileName = ROOT + "PropList.txt";
76+
static String DefaultDerivedPropsFileName = ROOT + "DerivedCoreProperties.txt";
7677
static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
7778
static String DefaultJavaOutputFileName = ROOT + "Character.java";
7879
static String DefaultCTemplateFileName = ROOT + "Character.c.template";
@@ -159,6 +160,8 @@ the values in the table can be preshifted (generally possible if the table
159160
1 bit Other_Math property
160161
1 bit Ideographic property
161162
1 bit Noncharacter codepoint property
163+
1 bit ID_Start property
164+
1 bit ID_Continue property
162165
*/
163166

164167

@@ -190,15 +193,17 @@ the values in the table can be preshifted (generally possible if the table
190193
// maskMirrored needs to be long, if up 16-bit
191194
private static final long maskMirrored = 0x80000000L;
192195

193-
// bit masks identify the 16-bit priperty field described above, in B
196+
// bit masks identify the 16-bit property field described above, in B
194197
// table
195198
private static final long
196199
maskOtherLowercase = 0x100000000L,
197200
maskOtherUppercase = 0x200000000L,
198201
maskOtherAlphabetic = 0x400000000L,
199202
maskOtherMath = 0x800000000L,
200203
maskIdeographic = 0x1000000000L,
201-
maskNoncharacterCP = 0x2000000000L;
204+
maskNoncharacterCP = 0x2000000000L,
205+
maskIDStart = 0x4000000000L,
206+
maskIDContinue = 0x8000000000L;
202207

203208
// Can compare masked values with these to determine
204209
// numeric or lexical types.
@@ -367,6 +372,8 @@ else if(data[j].codePoint > codePoint) {
367372
addExProp(result, propList, "Ideographic", maskIdeographic);
368373
//addExProp(result, propList, "Other_Math", maskOtherMath);
369374
//addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
375+
addExProp(result, propList, "ID_Start", maskIDStart);
376+
addExProp(result, propList, "ID_Continue", maskIDContinue);
370377

371378
return result;
372379
}
@@ -780,6 +787,8 @@ static String replaceCommand(String x) {
780787
if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
781788
if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
782789
if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
790+
if (x.equals("maskIDStart")) return "0x" + hex4(maskIDStart >> 32);
791+
if (x.equals("maskIDContinue")) return "0x" + hex4(maskIDContinue >> 32);
783792
if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
784793
if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
785794
if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
@@ -1612,6 +1621,7 @@ static String genAccess(String tbl, String var, int bits) {
16121621
static String UnicodeSpecFileName = null; // liu
16131622
static String SpecialCasingFileName = null;
16141623
static String PropListFileName = null;
1624+
static String DerivedPropsFileName = null;
16151625
static boolean useCharForByte = false;
16161626
static int[] sizes;
16171627
static int bins = 0; // liu; if > 0, then perform search
@@ -1739,6 +1749,14 @@ else if (args[j].equals("-proplist")) {
17391749
PropListFileName = args[++j];
17401750
}
17411751
}
1752+
else if (args[j].equals("-derivedprops")) {
1753+
if (j == args.length -1) {
1754+
FAIL("File name missing after -derivedprops");
1755+
}
1756+
else {
1757+
DerivedPropsFileName = args[++j];
1758+
}
1759+
}
17421760
else if (args[j].equals("-plane")) {
17431761
if (j == args.length -1) {
17441762
FAIL("Plane number missing after -plane");
@@ -1803,6 +1821,10 @@ else if (args[j].equals("-latin1")) {
18031821
PropListFileName = DefaultPropListFileName;
18041822
desc.append(" [-proplist " + PropListFileName + ']');
18051823
}
1824+
if (DerivedPropsFileName == null) {
1825+
DerivedPropsFileName = DefaultDerivedPropsFileName;
1826+
desc.append(" [-derivedprops " + DerivedPropsFileName + ']');
1827+
}
18061828
if (TemplateFileName == null) {
18071829
TemplateFileName = (Csyntax ? DefaultCTemplateFileName
18081830
: DefaultJavaTemplateFileName);
@@ -1954,6 +1976,7 @@ public static void main(String[] args) {
19541976
UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
19551977
specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
19561978
PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1979+
propList.putAll(PropList.readSpecFile(new File(DerivedPropsFileName), plane));
19571980

19581981
if (verbose) {
19591982
System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu

make/jdk/src/classes/build/tools/generatecharacter/PropList.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -31,7 +31,8 @@
3131

3232
/**
3333
* A PropList object contains the lists of code points that have
34-
* the same Unicode property defined in PropList.txt
34+
* the same Unicode property defined in PropList.txt and
35+
* DerivedCoreProperties.txt
3536
*
3637
* @author Xueming Shen
3738
*/
@@ -51,8 +52,13 @@ public Set<String> names() {
5152
return propMap.keySet();
5253
}
5354

54-
private Map<String, ArrayList<Integer>> propMap =
55-
new LinkedHashMap<String, ArrayList<Integer>>();
55+
public void putAll(PropList pl) {
56+
pl.names().stream()
57+
.forEach(name -> propMap.put(name, pl.codepoints(name)));
58+
}
59+
60+
private Map<String, List<Integer>> propMap =
61+
new LinkedHashMap<String, List<Integer>>();
5662

5763
private PropList(File file, int plane) throws IOException {
5864

@@ -78,7 +84,7 @@ private PropList(File file, int plane) throws IOException {
7884
start &= 0xffff;
7985
end &= 0xffff;
8086

81-
ArrayList<Integer> list = propMap.get(name);
87+
List<Integer> list = propMap.get(name);
8288
if (list == null) {
8389
list = new ArrayList<Integer>();
8490
propMap.put(name, list);

src/java.base/share/classes/java/lang/Character.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9917,7 +9917,18 @@ public static boolean isJavaIdentifierPart(int codePoint) {
99179917
* <li> {@link #isLetter(char) isLetter(ch)} returns {@code true}
99189918
* <li> {@link #getType(char) getType(ch)} returns
99199919
* {@code LETTER_NUMBER}.
9920+
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
9921+
* {@code Other_ID_Start}</a> character.
99209922
* </ul>
9923+
* <p>
9924+
* This method conforms to the <a href="https://unicode.org/reports/tr31/#R1">
9925+
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
9926+
* with the following profile of UAX31:
9927+
* <pre>
9928+
* Start := ID_Start + 'VERTICAL TILDE' (U+2E2F)
9929+
* </pre>
9930+
* {@code 'VERTICAL TILDE'} is added to {@code Start} for backward
9931+
* compatibility.
99219932
*
99229933
* <p><b>Note:</b> This method cannot handle <a
99239934
* href="#supplementary"> supplementary characters</a>. To support
@@ -9947,7 +9958,19 @@ public static boolean isUnicodeIdentifierStart(char ch) {
99479958
* returns {@code true}
99489959
* <li> {@link #getType(int) getType(codePoint)}
99499960
* returns {@code LETTER_NUMBER}.
9961+
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
9962+
* {@code Other_ID_Start}</a> character.
99509963
* </ul>
9964+
* <p>
9965+
* This method conforms to the <a href="https://unicode.org/reports/tr31/#R1">
9966+
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
9967+
* with the following profile of UAX31:
9968+
* <pre>
9969+
* Start := ID_Start + 'VERTICAL TILDE' (U+2E2F)
9970+
* </pre>
9971+
* {@code 'VERTICAL TILDE'} is added to {@code Start} for backward
9972+
* compatibility.
9973+
*
99519974
* @param codePoint the character (Unicode code point) to be tested.
99529975
* @return {@code true} if the character may start a Unicode
99539976
* identifier; {@code false} otherwise.
@@ -9975,7 +9998,22 @@ public static boolean isUnicodeIdentifierStart(int codePoint) {
99759998
* <li> it is a non-spacing mark
99769999
* <li> {@code isIdentifierIgnorable} returns
997710000
* {@code true} for this character.
10001+
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
10002+
* {@code Other_ID_Start}</a> character.
10003+
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Continue">
10004+
* {@code Other_ID_Continue}</a> character.
997810005
* </ul>
10006+
* <p>
10007+
* This method conforms to <a href="https://unicode.org/reports/tr31/#R1">
10008+
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
10009+
* with the following profile of UAX31:
10010+
* <pre>
10011+
* Continue := Start + ID_Continue + ignorable
10012+
* Medial := empty
10013+
* ignorable := isIdentifierIgnorable(char) returns true for the character
10014+
* </pre>
10015+
* {@code ignorable} is added to {@code Continue} for backward
10016+
* compatibility.
997910017
*
998010018
* <p><b>Note:</b> This method cannot handle <a
998110019
* href="#supplementary"> supplementary characters</a>. To support
@@ -10010,7 +10048,23 @@ public static boolean isUnicodeIdentifierPart(char ch) {
1001010048
* <li> it is a non-spacing mark
1001110049
* <li> {@code isIdentifierIgnorable} returns
1001210050
* {@code true} for this character.
10051+
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
10052+
* {@code Other_ID_Start}</a> character.
10053+
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Continue">
10054+
* {@code Other_ID_Continue}</a> character.
1001310055
* </ul>
10056+
* <p>
10057+
* This method conforms to the <a href="https://unicode.org/reports/tr31/#R1">
10058+
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
10059+
* with the following profile of UAX31:
10060+
* <pre>
10061+
* Continue := Start + ID_Continue + ignorable
10062+
* Medial := empty
10063+
* ignorable := isIdentifierIgnorable(int) returns true for the character
10064+
* </pre>
10065+
* {@code ignorable} is added to {@code Continue} for backward
10066+
* compatibility.
10067+
*
1001410068
* @param codePoint the character (Unicode code point) to be tested.
1001510069
* @return {@code true} if the character may be part of a
1001610070
* Unicode identifier; {@code false} otherwise.

0 commit comments

Comments
 (0)