Skip to content

Commit

Permalink
ByteBufUtil.writeUtf8 Surrogate Support
Browse files Browse the repository at this point in the history
Motivation:
A single 16-bit UTF-16 code unit cannot represent the full range of Unicode characters, so UTF-16 has the concept of a Surrogate Pair (http://unicode.org/glossary/#surrogate_pair), where two 16-bit code units together represent the remaining characters. ByteBufUtil.writeUtf8 currently does not support this and is thus incomplete.

Modifications:
- Add support for surrogate pairs in ByteBufUtil.writeUtf8

Result:
ByteBufUtil.writeUtf8 now supports surrogate pairs and correctly converts them to UTF-8.
  • Loading branch information
Scottmitch committed Dec 18, 2015
1 parent 693633e commit f750d6e
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 16 deletions.
30 changes: 28 additions & 2 deletions buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
Expand Up @@ -38,9 +38,10 @@
import java.util.Arrays;
import java.util.Locale;

import static io.netty.util.internal.StringUtil.NEWLINE;
import static io.netty.util.internal.ObjectUtil.checkNotNull;
import static io.netty.util.internal.MathUtil.isOutOfBounds;
import static io.netty.util.internal.ObjectUtil.checkNotNull;
import static io.netty.util.internal.StringUtil.NEWLINE;
import static io.netty.util.internal.StringUtil.isSurrogate;

/**
* A collection of utility methods that is related with handling {@link ByteBuf},
Expand Down Expand Up @@ -397,6 +398,31 @@ private static int writeUtf8(AbstractByteBuf buffer, CharSequence seq, int len)
} else if (c < 0x800) {
buffer._setByte(writerIndex++, (byte) (0xc0 | (c >> 6)));
buffer._setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
} else if (isSurrogate(c)) {
if (!Character.isHighSurrogate(c)) {
throw new IllegalArgumentException("Invalid encoding. " +
"Expected high (leading) surrogate at index " + i + " but got " + c);
}
final char c2;
try {
// Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
// duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
// re-throw a more informative exception describing the problem.
c2 = seq.charAt(++i);
} catch (IndexOutOfBoundsException e) {
throw new IllegalArgumentException("Underflow. " +
"Expected low (trailing) surrogate at index " + i + " but no more characters found.", e);
}
if (!Character.isLowSurrogate(c2)) {
throw new IllegalArgumentException("Invalid encoding. " +
"Expected low (trailing) surrogate at index " + i + " but got " + c2);
}
int codePoint = Character.toCodePoint(c, c2);
// See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
buffer._setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18)));
buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f)));
buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f)));
buffer._setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f)));
} else {
buffer._setByte(writerIndex++, (byte) (0xe0 | (c >> 12)));
buffer._setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f)));
Expand Down
25 changes: 19 additions & 6 deletions buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
Expand Up @@ -15,19 +15,17 @@
*/
package io.netty.buffer;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import io.netty.util.AsciiString;
import io.netty.util.CharsetUtil;
import io.netty.util.ReferenceCountUtil;

import java.util.Random;

import org.junit.Assert;
import org.junit.Test;

import java.nio.charset.Charset;
import java.util.Random;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class ByteBufUtilTest {
@Test
Expand Down Expand Up @@ -131,6 +129,21 @@ public void testWriteUtf8() {
Assert.assertEquals(buf, buf2);
}

@Test
public void testWriteUtf8Surrogates() {
    // One well-formed surrogate pair: a high (leading) surrogate followed by a low (trailing) surrogate.
    final char[] pair = { '\uD800', '\uDC00' };
    final String surrogateString = new String(pair);

    // Reference bytes: the same string encoded through String.getBytes with CharsetUtil.UTF_8.
    final ByteBuf expected = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
    final byte[] referenceBytes = surrogateString.getBytes(CharsetUtil.UTF_8);
    expected.writeBytes(referenceBytes);

    // Bytes produced by the method under test.
    final ByteBuf actual = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
    ByteBufUtil.writeUtf8(actual, surrogateString);

    Assert.assertEquals(expected, actual);
}

@Test
public void testWriteUsAsciiString() {
AsciiString usAscii = new AsciiString("NettyRocks");
Expand Down
5 changes: 1 addition & 4 deletions common/src/main/java/io/netty/util/internal/MathUtil.java
Expand Up @@ -60,9 +60,6 @@ public static boolean isOutOfBounds(int index, int length, int capacity) {
* </ul>
*/
public static int compare(long x, long y) {
if (PlatformDependent.javaVersion() < 7) {
return (x < y) ? -1 : (x > y) ? 1 : 0;
}
return Long.compare(x, y);
return (x < y) ? -1 : (x > y) ? 1 : 0;
}
}
12 changes: 11 additions & 1 deletion common/src/main/java/io/netty/util/internal/StringUtil.java
Expand Up @@ -15,7 +15,6 @@
*/
package io.netty.util.internal;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Formatter;
Expand Down Expand Up @@ -391,6 +390,17 @@ public static boolean isNullOrEmpty(String s) {
return s == null || s.isEmpty();
}

/**
 * Tells whether {@code c} is a UTF-16 surrogate code unit, i.e. whether its value falls inside the range
 * reserved for a <a href="http://unicode.org/glossary/#surrogate_code_point">Surrogate Code Point</a>
 * (U+D800 through U+DFFF, both ends inclusive). High (leading) and low (trailing) surrogates both match.
 *
 * @param c the character to check.
 * @return {@code true} if {@code c} is within the surrogate range, {@code false} otherwise.
 */
public static boolean isSurrogate(char c) {
    // Character.MIN_SURROGATE is U+D800 and Character.MAX_SURROGATE is U+DFFF.
    return Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE;
}

private static boolean isDoubleQuote(char c) {
return c == DOUBLE_QUOTE;
}
Expand Down
3 changes: 0 additions & 3 deletions pom.xml
Expand Up @@ -1092,9 +1092,6 @@
<ignore>java.security.AlgorithmConstraints</ignore>

<ignore>java.util.concurrent.ConcurrentLinkedDeque</ignore>

<!-- Used in internal utilities (protected by conditional) -->
<ignore>java.lang.Long</ignore>
</ignores>
</configuration>
<executions>
Expand Down

0 comments on commit f750d6e

Please sign in to comment.