From f750d6e36c80e88fb302c99b5b7413e5649e6738 Mon Sep 17 00:00:00 2001 From: Scott Mitchell Date: Fri, 18 Dec 2015 10:53:54 -0800 Subject: [PATCH] ByteBufUtil.writeUtf8 Surrogate Support Motivation: UTF-16 can not represent the full range of Unicode characters, and thus has the concept of Surrogate Pair (http://unicode.org/glossary/#surrogate_pair) where 2 16-bit code units can be used to represent the missing characters. ByteBufUtil.writeUtf8 is currently does not support this and is thus incomplete. Modifications: - Add support for surrogate pairs in ByteBufUtil.writeUtf8 Result: ByteBufUtil.writeUtf8 now supports surrogate pairs and is correctly converting to UTF-8. --- .../java/io/netty/buffer/ByteBufUtil.java | 30 +++++++++++++++++-- .../java/io/netty/buffer/ByteBufUtilTest.java | 25 ++++++++++++---- .../java/io/netty/util/internal/MathUtil.java | 5 +--- .../io/netty/util/internal/StringUtil.java | 12 +++++++- pom.xml | 3 -- 5 files changed, 59 insertions(+), 16 deletions(-) diff --git a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java index b683b56a47c..6ceb1b06a2b 100644 --- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java +++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java @@ -38,9 +38,10 @@ import java.util.Arrays; import java.util.Locale; -import static io.netty.util.internal.StringUtil.NEWLINE; -import static io.netty.util.internal.ObjectUtil.checkNotNull; import static io.netty.util.internal.MathUtil.isOutOfBounds; +import static io.netty.util.internal.ObjectUtil.checkNotNull; +import static io.netty.util.internal.StringUtil.NEWLINE; +import static io.netty.util.internal.StringUtil.isSurrogate; /** * A collection of utility methods that is related with handling {@link ByteBuf}, @@ -397,6 +398,31 @@ private static int writeUtf8(AbstractByteBuf buffer, CharSequence seq, int len) } else if (c < 0x800) { buffer._setByte(writerIndex++, (byte) (0xc0 | (c >> 6))); buffer._setByte(writerIndex++, (byte) (0x80 | (c & 0x3f))); + } else if (isSurrogate(c)) { + if (!Character.isHighSurrogate(c)) { + throw new IllegalArgumentException("Invalid encoding. " + + "Expected high (leading) surrogate at index " + i + " but got " + c); + } + final char c2; + try { + // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid + // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will + // re-throw a more informative exception describing the problem. + c2 = seq.charAt(++i); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException("Underflow. " + + "Expected low (trailing) surrogate at index " + i + " but no more characters found.", e); + } + if (!Character.isLowSurrogate(c2)) { + throw new IllegalArgumentException("Invalid encoding. " + + "Expected low (trailing) surrogate at index " + i + " but got " + c2); + } + int codePoint = Character.toCodePoint(c, c2); + // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630. + buffer._setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18))); + buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f))); + buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f))); + buffer._setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f))); } else { buffer._setByte(writerIndex++, (byte) (0xe0 | (c >> 12))); buffer._setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f))); diff --git a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java index a401fd1a231..0309db57ec3 100644 --- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java +++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java @@ -15,19 +15,17 @@ */ package io.netty.buffer; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - import io.netty.util.AsciiString; import io.netty.util.CharsetUtil; import io.netty.util.ReferenceCountUtil; - -import java.util.Random; - import org.junit.Assert; import org.junit.Test; import java.nio.charset.Charset; +import java.util.Random; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; public class ByteBufUtilTest { @Test @@ -131,6 +129,21 @@ public void testWriteUtf8() { Assert.assertEquals(buf, buf2); } + @Test + public void testWriteUtf8Surrogates() { + // leading surrogate + trailing surrogate + String surrogateString = new StringBuilder(2) + .append('\uD800') + .append('\uDC00') + .toString(); + ByteBuf buf = ReferenceCountUtil.releaseLater(Unpooled.buffer(16)); + buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8)); + ByteBuf buf2 = ReferenceCountUtil.releaseLater(Unpooled.buffer(16)); + ByteBufUtil.writeUtf8(buf2, surrogateString); + + Assert.assertEquals(buf, buf2); + } + @Test public void testWriteUsAsciiString() { AsciiString usAscii = new AsciiString("NettyRocks"); diff --git a/common/src/main/java/io/netty/util/internal/MathUtil.java b/common/src/main/java/io/netty/util/internal/MathUtil.java index 2be73c4b0d7..2b16e22d7ce 100644 --- a/common/src/main/java/io/netty/util/internal/MathUtil.java +++ b/common/src/main/java/io/netty/util/internal/MathUtil.java @@ -60,9 +60,6 @@ public static boolean isOutOfBounds(int index, int length, int capacity) { * */ public static int compare(long x, long y) { - if (PlatformDependent.javaVersion() < 7) { - return (x < y) ? -1 : (x > y) ? 1 : 0; - } - return Long.compare(x, y); + return (x < y) ? -1 : (x > y) ? 1 : 0; } } diff --git a/common/src/main/java/io/netty/util/internal/StringUtil.java b/common/src/main/java/io/netty/util/internal/StringUtil.java index f5ca73de7a7..6ec86c84cab 100644 --- a/common/src/main/java/io/netty/util/internal/StringUtil.java +++ b/common/src/main/java/io/netty/util/internal/StringUtil.java @@ -15,7 +15,6 @@ */ package io.netty.util.internal; - import java.io.IOException; import java.util.ArrayList; import java.util.Formatter; @@ -391,6 +390,17 @@ public static boolean isNullOrEmpty(String s) { return s == null || s.isEmpty(); } + /** + * Determine if {@code c} lies within the range of values defined for + * Surrogate Code Point. + * @param c the character to check. + * @return {@code true} if {@code c} lies within the range of values defined for + * Surrogate Code Point. {@code false} otherwise. + */ + public static boolean isSurrogate(char c) { + return c >= '\uD800' && c <= '\uDFFF'; + } + private static boolean isDoubleQuote(char c) { return c == DOUBLE_QUOTE; } diff --git a/pom.xml b/pom.xml index 633d933be5a..a296705e3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -1092,9 +1092,6 @@ java.security.AlgorithmConstraints java.util.concurrent.ConcurrentLinkedDeque - - - java.lang.Long