Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
8303018: Unicode Emoji Properties
Reviewed-by: prr, erikj, rriggs
  • Loading branch information
naotoj committed Mar 20, 2023
1 parent bc0ed73 commit f593a6b
Show file tree
Hide file tree
Showing 19 changed files with 643 additions and 278 deletions.
93 changes: 93 additions & 0 deletions make/jdk/src/classes/build/tools/generatecharacter/EmojiData.java
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package build.tools.generatecharacter;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.AbstractMap;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * A class holding emoji character properties
 * https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files
 *
 * <p>Each non-comment line of the data file has the form
 * {@code <codepoint>[..<codepoint>] ; <property> # comment}. For every code
 * point that belongs to the requested plane, the masks of all properties
 * listed for it are OR-ed together into a single {@code long}.
 */
class EmojiData {
    // Emoji properties map: code point -> OR-ed GenerateCharacter.maskXxx bits
    private final Map<Integer, Long> emojiProps;

    /**
     * Reads the given emoji data file, keeping only the entries of one plane.
     *
     * @param file  path to the emoji data file
     * @param plane plane number; an entry is kept when {@code (cp >> 16) == plane}
     * @return the parsed emoji data
     * @throws IOException if the file cannot be read
     */
    static EmojiData readSpecFile(Path file, int plane) throws IOException {
        return new EmojiData(file, plane);
    }

    /**
     * Parses the given emoji data file, keeping only the entries of one plane.
     *
     * @param file  path to the emoji data file
     * @param plane plane number; an entry is kept when {@code (cp >> 16) == plane}
     * @throws IOException if the file cannot be read
     */
    EmojiData(Path file, int plane) throws IOException {
        emojiProps = Files.readAllLines(file).stream()
            // drop the trailing "# ..." comment, then skip lines with nothing left
            .map(line -> line.split("#", 2)[0])
            .filter(Predicate.not(String::isBlank))
            .flatMap(line -> parseLine(line, plane))
            // a code point may appear on several lines, one per property
            .collect(Collectors.toMap(Map.Entry::getKey,
                                      Map.Entry::getValue,
                                      (v1, v2) -> v1 | v2));
    }

    /**
     * Expands one data line into (code point, property mask) pairs, restricted
     * to the code points that fall inside the given plane.
     */
    private static Stream<Map.Entry<Integer, Long>> parseLine(String line, int plane) {
        var fields = line.split("[ \t]*;[ \t]*", 2);
        var range = fields[0].split("\\.\\.", 2);
        int first = Integer.parseInt(range[0], 16);
        int last = range.length == 1 ? first : Integer.parseInt(range[1], 16);

        // Clamp to the plane so that a range straddling a plane boundary neither
        // leaks code points of a neighbouring plane nor gets dropped entirely.
        int lo = Math.max(first, plane << 16);
        int hi = Math.min(last, (plane << 16) | 0xFFFF);
        if (lo > hi) {
            return Stream.empty();
        }

        // Resolve the property name once per line rather than once per code point.
        long mask = convertType(fields[1].trim());
        return IntStream.rangeClosed(lo, hi)
            .mapToObj(cp -> Map.entry(cp, mask));
    }

    /**
     * Returns the OR-ed emoji property masks of the given code point.
     *
     * @param cp the code point to look up
     * @return the property bits, or {@code 0} if the code point has no emoji
     *         property recorded for the plane that was read
     */
    long properties(int cp) {
        // getOrDefault avoids unboxing a null for code points that are not present
        return emojiProps.getOrDefault(cp, 0L);
    }

    /**
     * Returns the code points that have at least one emoji property.
     *
     * @return the key set of the properties map
     */
    Set<Integer> codepoints() {
        return emojiProps.keySet();
    }

    /**
     * Maps a property name from the data file to its GenerateCharacter mask.
     *
     * @throws InternalError if the name is not one of the known emoji properties
     */
    private static long convertType(String type) {
        return switch (type) {
            case "Emoji" -> GenerateCharacter.maskEmoji;
            case "Emoji_Presentation" -> GenerateCharacter.maskEmojiPresentation;
            case "Emoji_Modifier" -> GenerateCharacter.maskEmojiModifier;
            case "Emoji_Modifier_Base" -> GenerateCharacter.maskEmojiModifierBase;
            case "Emoji_Component" -> GenerateCharacter.maskEmojiComponent;
            case "Extended_Pictographic" -> GenerateCharacter.maskExtendedPictographic;
            default -> throw new InternalError("Unrecognizable Emoji type: " + type);
        };
    }
}
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -33,6 +33,7 @@
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.File;
import java.nio.file.Paths;
import java.util.List;

import build.tools.generatecharacter.CharacterName;
Expand Down Expand Up @@ -74,6 +75,7 @@ public class GenerateCharacter {
static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
static String DefaultPropListFileName = ROOT + "PropList.txt";
static String DefaultDerivedPropsFileName = ROOT + "DerivedCoreProperties.txt";
static String DefaultEmojiDataFileName = ROOT + "emoji-data.txt";
static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
static String DefaultJavaOutputFileName = ROOT + "Character.java";
static String DefaultCTemplateFileName = ROOT + "Character.c.template";
Expand Down Expand Up @@ -105,7 +107,7 @@ the values in the table can be preshifted (generally possible if the table
entries are short rather than byte).
*/

/* The character properties are currently encoded into A (32 bits) and B (8 bits)
/* The character properties are currently encoded into A (32 bits) and B (16 bits)
two parts.
A: the low 32 bits are defined in the following manner:
Expand Down Expand Up @@ -160,6 +162,13 @@ the values in the table can be preshifted (generally possible if the table
1 bit Ideographic property
1 bit ID_Start property
1 bit ID_Continue property
6 bits for Emoji properties :-
1 bit for Emoji
1 bit for Emoji_Presentation
1 bit for Emoji_Modifier
1 bit for Emoji_Modifier_Base
1 bit for Emoji_Component
1 bit for Extended_Pictographic
*/


Expand Down Expand Up @@ -188,15 +197,21 @@ the values in the table can be preshifted (generally possible if the table
// maskMirrored needs to be long, if up 16-bit
private static final long maskMirrored = 0x80000000L;

// bit masks identify the 8-bit property field described above, in B
// bit masks identify the 16-bit property field described above, in B
// table
private static final long
maskOtherLowercase = 0x0100000000L,
maskOtherUppercase = 0x0200000000L,
maskOtherAlphabetic = 0x0400000000L,
maskIdeographic = 0x0800000000L,
maskIDStart = 0x1000000000L,
maskIDContinue = 0x2000000000L;
static final long
maskOtherLowercase = 1L << 32,
maskOtherUppercase = 1L << 33,
maskOtherAlphabetic = 1L << 34,
maskIdeographic = 1L << 35,
maskIDStart = 1L << 36,
maskIDContinue = 1L << 37,
maskEmoji = 1L << 38,
maskEmojiPresentation = 1L << 39,
maskEmojiModifier = 1L << 40,
maskEmojiModifierBase = 1L << 41,
maskEmojiComponent = 1L << 42,
maskExtendedPictographic = 1L << 43;

// Can compare masked values with these to determine
// numeric or lexical types.
Expand Down Expand Up @@ -304,7 +319,7 @@ static void FAIL(String s) {
* @see GenerateCharacter#buildOne
*/

static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList, EmojiData emojiData)
{
long[] result = new long[bLatin1 ? 256 : 1 << 16];
int k = 0;
Expand Down Expand Up @@ -361,6 +376,9 @@ else if(data[j].codePoint > codePoint) {
addExProp(result, propList, "ID_Start", maskIDStart);
addExProp(result, propList, "ID_Continue", maskIDContinue);

// add Emoji properties to the upper 16 bits
addEmojiProps(result, emojiData);

return result;
}

Expand Down Expand Up @@ -583,6 +601,14 @@ static void addExProp(long[] map, PropList propList, String prop, long mask) {
}
}

/**
 * Merges the emoji property bits of every code point known to
 * {@code emojiData} into the corresponding slot of {@code map}.
 * The slot is the low 16 bits of the code point; code points whose
 * slot lies beyond the end of {@code map} are skipped.
 */
static void addEmojiProps(long[] map, EmojiData emojiData) {
    emojiData.codepoints().forEach(codePoint -> {
        int slot = codePoint & 0xFFFF;
        if (slot >= map.length) {
            return;
        }
        map[slot] |= emojiData.properties(codePoint);
    });
}

/**
* This is the heart of the table compression strategy. The inputs are a map
* and a number of bits (size). The map is simply an array of long integer values;
Expand Down Expand Up @@ -776,6 +802,12 @@ static String replaceCommand(String x) {
if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
if (x.equals("maskIDStart")) return "0x" + hex4(maskIDStart >> 32);
if (x.equals("maskIDContinue")) return "0x" + hex4(maskIDContinue >> 32);
if (x.equals("maskEmoji")) return "0x" + hex4(maskEmoji >> 32);
if (x.equals("maskEmojiPresentation")) return "0x" + hex4(maskEmojiPresentation >> 32);
if (x.equals("maskEmojiModifier")) return "0x" + hex4(maskEmojiModifier >> 32);
if (x.equals("maskEmojiModifierBase")) return "0x" + hex4(maskEmojiModifierBase >> 32);
if (x.equals("maskEmojiComponent")) return "0x" + hex4(maskEmojiComponent >> 32);
if (x.equals("maskExtendedPictographic")) return "0x" + hex4(maskExtendedPictographic >> 32);
if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
Expand Down Expand Up @@ -952,7 +984,7 @@ static String genTables() {

// If we ever need more than 32 bits to represent the character properties,
// then a table "B" may be needed as well.
genTable(result, "B", tables[n - 1], 32, 8, sizes[n - 1], false, 0, true, true, false);
genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);

totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
result.append(commentStart);
Expand Down Expand Up @@ -1434,6 +1466,42 @@ static void propertiesComments(StringBuffer result, long val) {
result.append(", supradecimal ");
result.append((val & maskDigitOffset) >> shiftDigitOffset);
}
if ((val & maskOtherLowercase) == maskOtherLowercase) {
result.append(", otherLowercase");
}
if ((val & maskOtherUppercase) == maskOtherUppercase) {
result.append(", otherUppercase");
}
if ((val & maskOtherAlphabetic) == maskOtherAlphabetic) {
result.append(", otherAlphabetic");
}
if ((val & maskIdeographic) == maskIdeographic) {
result.append(", ideographic");
}
if ((val & maskIDStart) == maskIDStart) {
result.append(", IDStart");
}
if ((val & maskIDContinue) == maskIDContinue) {
result.append(", IDContinue");
}
if ((val & maskEmoji) == maskEmoji) {
result.append(", emoji");
}
if ((val & maskEmojiPresentation) == maskEmojiPresentation) {
result.append(", emojiPresentation");
}
if ((val & maskEmojiModifier) == maskEmojiModifier) {
result.append(", emojiModifier");
}
if ((val & maskEmojiModifierBase) == maskEmojiModifierBase) {
result.append(", emojiModifierBase");
}
if ((val & maskEmojiComponent) == maskEmojiComponent) {
result.append(", emojiComponent");
}
if ((val & maskExtendedPictographic) == maskExtendedPictographic) {
result.append(", extendedPictographic");
}
}

static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
Expand Down Expand Up @@ -1512,6 +1580,7 @@ static String genAccess(String tbl, String var, int bits) {
static String SpecialCasingFileName = null;
static String PropListFileName = null;
static String DerivedPropsFileName = null;
static String EmojiDataFileName = null;
static boolean useCharForByte = false;
static int[] sizes;
static int bins = 0; // liu; if > 0, then perform search
Expand Down Expand Up @@ -1649,6 +1718,14 @@ else if (args[j].equals("-derivedprops")) {
DerivedPropsFileName = args[++j];
}
}
else if (args[j].equals("-emojidata")) {
if (j == args.length -1) {
FAIL("File name missing after -emojidata");
}
else {
EmojiDataFileName = args[++j];
}
}
else if (args[j].equals("-plane")) {
if (j == args.length -1) {
FAIL("Plane number missing after -plane");
Expand Down Expand Up @@ -1717,6 +1794,10 @@ else if (args[j].equals("-latin1")) {
DerivedPropsFileName = DefaultDerivedPropsFileName;
desc.append(" [-derivedprops " + DerivedPropsFileName + ']');
}
if (EmojiDataFileName == null) {
EmojiDataFileName = DefaultEmojiDataFileName;
desc.append(" [-emojidata " + EmojiDataFileName + ']');
}
if (TemplateFileName == null) {
TemplateFileName = (Csyntax ? DefaultCTemplateFileName
: DefaultJavaTemplateFileName);
Expand Down Expand Up @@ -1871,11 +1952,12 @@ public static void main(String[] args) {
specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
propList.putAll(PropList.readSpecFile(new File(DerivedPropsFileName), plane));
EmojiData emojiData = EmojiData.readSpecFile(Paths.get(EmojiDataFileName), plane);

if (verbose) {
System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
}
long[] map = buildMap(data, specialCaseMaps, propList);
long[] map = buildMap(data, specialCaseMaps, propList, emojiData);
if (verbose) {
System.err.println("Completed building of initial map");
}
Expand Down

1 comment on commit f593a6b

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.