Skip to content
Permalink
Browse files
Utf 8 encoding optimizations (#1444)
* fix: Correct typo weather to whether

* misc: Change internal Encoding.testAsciiNumbers(...) to be static

* perf: Enhance Encoding constructor to allow skipping of ASCII number compatibility test

Adds a two-parameter constructor to Encoding that allows subclasses to declare their
known ASCII number compatibility so that the compatibility test can be skipped. Its only
usage is in UTF8Encoding, which is changed to use the new constructor.

* fix: limit size of char[] for utf-8 decoding

Also optimize for java 9+ byte[] backed strings

* fix: limit size of char[] for utf-8 decoding

address style issues

* fix: limit size of char[] for utf-8 decoding

address style issues

* fix: limit size of char[] for utf-8 decoding

use existing JavaVersion enum to pick implementation
add unit test string values
more consistency between byte and char based implementations

* fix: limit size of char[] for utf-8 decoding

fix backwards comparison
add more strings to unit test

* fix: limit size of char[] for utf-8 decoding

Move to using new String(byte[], int, int, Charset) rather than custom
decoding for jre newer than 1.8.

* fix: limit size of char[] for utf-8 decoding

Back to custom utf-8 decoding for performance gains while validating

* javadoc

* put test back into test suite

* avoid creating an unnecessary `char[]` when growing cached array

Co-authored-by: Sehrope Sarkuni <sehrope@jackdb.com>
Co-authored-by: Dave Cramer <davecramer@gmail.com>
  • Loading branch information
3 people committed Jan 16, 2020
1 parent 2972add commit c84e62efa5b98323562753e45fbf0d974eaca483
@@ -0,0 +1,51 @@
/*
* Copyright (c) 2019, PostgreSQL Global Development Group
* See the LICENSE file in the project root for more information.
*/

package org.postgresql.core;

import java.io.IOException;
import java.nio.charset.Charset;

/**
 * UTF-8 encoder which validates input, tuned for jdk 9+ where {@code String} objects are backed by
 * {@code byte[]}.
 *
 * <p>Input of at most 32 bytes is decoded straight to chars. Longer input is first scanned for
 * non-ascii bytes: if none are found the bytes are handed to {@code String} with an ascii charset,
 * otherwise decoding continues through a {@code char[]}.</p>
 *
 * @author Brett Okken
 */
final class ByteOptimizedUTF8Encoder extends OptimizedUTF8Encoder {

  private static final Charset ASCII_CHARSET = Charset.forName("ascii");

  /**
   * {@inheritDoc}
   */
  @Override
  public String decode(byte[] encodedString, int offset, int length) throws IOException {
    // very short strings: decoding directly to chars was measured as up to 30% faster
    if (length <= 32) {
      return charDecode(encodedString, offset, length);
    }

    final int end = offset + length;
    int pos = offset;
    while (pos < end) {
      // bytes are signed, so any negative value is outside the ascii range
      if (encodedString[pos] < 0) {
        return slowDecode(encodedString, offset, length, pos);
      }
      ++pos;
    }

    // every byte is ascii; telling java so lets it skip utf-8 decoding
    return new String(encodedString, offset, length, ASCII_CHARSET);
  }

  /**
   * Handles input that contains at least one non-ascii byte. The bytes before {@code curIdx} are
   * already known to be ascii and are widened directly into the {@code char[]}; the rest is
   * passed to {@code decodeToChars}.
   *
   * <p>NOTE(review): the method is synchronized, presumably because the {@code char[]} returned by
   * {@code getCharArray} is shared between calls - confirm against {@code OptimizedUTF8Encoder}.</p>
   *
   * @param encodedString bytes to decode
   * @param offset index of the first byte of the value
   * @param length total number of bytes in the value
   * @param curIdx index of the first non-ascii byte
   * @return the decoded string
   * @throws IOException declared because {@code decodeToChars} is called
   */
  private synchronized String slowDecode(byte[] encodedString, int offset, int length, int curIdx) throws IOException {
    final char[] chars = getCharArray(length);

    // copy the ascii prefix one byte to one char
    int written = 0;
    int src = offset;
    while (src < curIdx) {
      chars[written++] = (char) encodedString[src++];
    }

    final int remaining = length - (curIdx - offset);
    return decodeToChars(encodedString, curIdx, remaining, chars, written);
  }
}
@@ -0,0 +1,24 @@
/*
* Copyright (c) 2019, PostgreSQL Global Development Group
* See the LICENSE file in the project root for more information.
*/

package org.postgresql.core;

import java.io.IOException;

/**
 * UTF-8 encoder which validates input, tuned for jdk 8 and lower where {@code String} objects are
 * backed by {@code char[]}.
 *
 * <p>Every decode request is forwarded unchanged to {@code charDecode}.</p>
 *
 * @author Brett Okken
 */
final class CharOptimizedUTF8Encoder extends OptimizedUTF8Encoder {

  /**
   * {@inheritDoc}
   */
  @Override
  public String decode(byte[] encodedString, int offset, int length) throws IOException {
    // no ascii pre-scan here: the char[] based path is always used
    final String decoded = charDecode(encodedString, offset, length);
    return decoded;
  }
}
@@ -25,7 +25,6 @@ public class Encoding {
private static final Logger LOGGER = Logger.getLogger(Encoding.class.getName());

private static final Encoding DEFAULT_ENCODING = new Encoding();
private static final Encoding UTF8_ENCODING = new Encoding("UTF-8");

/*
* Preferred JVM encodings for backend encodings.
@@ -76,24 +75,51 @@ public class Encoding {
encodings.put("LATIN10", new String[0]);
}

private final String encoding;
/**
* Supplies the {@link Encoding} implementation to use for UTF-8. A single provider is selected
* once in the static initializer, based on the runtime java version.
*/
private interface UTFEncodingProvider {
/**
* @return the UTF-8 {@link Encoding}; the providers defined in this class create a new instance
*         on every call.
*/
Encoding getEncoding();
}

private static final UTFEncodingProvider UTF_ENCODING_PROVIDER;

static {
  // Choose the UTF-8 implementation once, based on the running java version.
  final UTFEncodingProvider provider;
  if (JavaVersion.v1_8.compareTo(JavaVersion.getRuntimeVersion()) >= 0) {
    // java 1.8 and older: use the implementation optimized for char[]
    provider = new UTFEncodingProvider() {
      @Override
      public Encoding getEncoding() {
        return new CharOptimizedUTF8Encoder();
      }
    };
  } else {
    // newer versions: use the implementation optimized for byte[]
    provider = new UTFEncodingProvider() {
      @Override
      public Encoding getEncoding() {
        return new ByteOptimizedUTF8Encoder();
      }
    };
  }
  UTF_ENCODING_PROVIDER = provider;
}

private final Charset encoding;
private final boolean fastASCIINumbers;

/**
* Uses the default charset of the JVM.
*/
private Encoding() {
this(Charset.defaultCharset().name());
this(Charset.defaultCharset());
}

/**
* Subclasses may use this constructor if they know in advance of their ASCII number
* compatibility.
*
* @param encoding charset name to use
* @param encoding charset to use
* @param fastASCIINumbers whether this encoding is compatible with ASCII numbers.
*/
protected Encoding(String encoding, boolean fastASCIINumbers) {
protected Encoding(Charset encoding, boolean fastASCIINumbers) {
if (encoding == null) {
throw new NullPointerException("Null encoding charset not supported");
}
@@ -109,9 +135,9 @@ protected Encoding(String encoding, boolean fastASCIINumbers) {
* Use the charset passed as parameter and tests at creation time whether the specified encoding
* is compatible with ASCII numbers.
*
* @param encoding charset name to use
* @param encoding charset to use
*/
protected Encoding(String encoding) {
protected Encoding(Charset encoding) {
this(encoding, testAsciiNumbers(encoding));
}

@@ -134,13 +160,12 @@ public boolean hasAsciiNumbers() {
*/
public static Encoding getJVMEncoding(String jvmEncoding) {
if ("UTF-8".equals(jvmEncoding)) {
return new UTF8Encoding();
return UTF_ENCODING_PROVIDER.getEncoding();
}
if (Charset.isSupported(jvmEncoding)) {
return new Encoding(jvmEncoding);
} else {
return DEFAULT_ENCODING;
return new Encoding(Charset.forName(jvmEncoding));
}
return DEFAULT_ENCODING;
}

/**
@@ -152,7 +177,7 @@ public static Encoding getJVMEncoding(String jvmEncoding) {
*/
public static Encoding getDatabaseEncoding(String databaseEncoding) {
if ("UTF8".equals(databaseEncoding)) {
return UTF8_ENCODING;
return UTF_ENCODING_PROVIDER.getEncoding();
}
// If the backend encoding is known and there is a suitable
// encoding in the JVM we use that. Otherwise we fall back
@@ -162,15 +187,15 @@ public static Encoding getDatabaseEncoding(String databaseEncoding) {
for (String candidate : candidates) {
LOGGER.log(Level.FINEST, "Search encoding candidate {0}", candidate);
if (Charset.isSupported(candidate)) {
return new Encoding(candidate);
return new Encoding(Charset.forName(candidate));
}
}
}

// Try the encoding name directly -- maybe the charset has been
// provided by the user.
if (Charset.isSupported(databaseEncoding)) {
return new Encoding(databaseEncoding);
return new Encoding(Charset.forName(databaseEncoding));
}

// Fall back to default JVM encoding.
@@ -184,7 +209,7 @@ public static Encoding getDatabaseEncoding(String databaseEncoding) {
* @return the JVM encoding name used by this instance.
*/
public String name() {
return Charset.isSupported(encoding) ? Charset.forName(encoding).name() : encoding;
return encoding.name();
}

/**
@@ -258,8 +283,9 @@ public static Encoding defaultEncoding() {
return DEFAULT_ENCODING;
}

@Override
public String toString() {
return encoding;
return encoding.name();
}

/**
@@ -268,7 +294,7 @@ public String toString() {
*
* @return If faster ASCII number parsing can be used with this encoding.
*/
private static boolean testAsciiNumbers(String encoding) {
private static boolean testAsciiNumbers(Charset encoding) {
// TODO: test all postgres supported encoding to see if there are
// any which do _not_ have ascii numbers in same location
// at least all the encoding listed in the encodings hashmap have

0 comments on commit c84e62e

Please sign in to comment.