From 80ef37b2190a835a19590b52957f248fec4bae02 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 14 Jan 2021 17:24:12 +0100 Subject: [PATCH 01/25] First stab --- .../share/classes/java/lang/String.java | 508 ++++++++++++++++- .../share/classes/java/lang/StringCoding.java | 521 +----------------- .../share/classes/java/lang/System.java | 4 +- 3 files changed, 503 insertions(+), 530 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index e96943ffe1327..ff95f419d8153 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -31,7 +31,9 @@ import java.lang.invoke.MethodHandles; import java.lang.constant.Constable; import java.lang.constant.ConstantDesc; -import java.nio.charset.Charset; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -51,7 +53,11 @@ import java.util.stream.StreamSupport; import jdk.internal.vm.annotation.IntrinsicCandidate; import jdk.internal.vm.annotation.Stable; +import sun.nio.cs.ArrayDecoder; +import static java.lang.Character.*; +import static java.lang.Character.lowSurrogate; +import static java.lang.StringUTF16.putChar; import static java.util.function.Predicate.not; /** @@ -217,6 +223,12 @@ public final class String COMPACT_STRINGS = true; } + private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; + private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; + private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; + + private static final char REPL = '\ufffd'; + /** * Class String is special cased within the Serialization Stream Protocol. * @@ -475,15 +487,9 @@ public String(byte ascii[], int hibyte) { * * @since 1.1 */ - public String(byte bytes[], int offset, int length, String charsetName) + public String(byte[] bytes, int offset, int length, String charsetName) throws UnsupportedEncodingException { - if (charsetName == null) - throw new NullPointerException("charsetName"); - checkBoundsOffCount(offset, length, bytes.length); - StringCoding.Result ret = - StringCoding.decode(charsetName, bytes, offset, length); - this.value = ret.value; - this.coder = ret.coder; + this(bytes, offset, length, StringCoding.lookupCharset(Objects.requireNonNull(charsetName))); } /** @@ -516,14 +522,479 @@ public String(byte bytes[], int offset, int length, String charsetName) * * @since 1.6 */ - public String(byte bytes[], int offset, int length, Charset charset) { - if (charset == null) - throw new NullPointerException("charset"); + public String(byte[] bytes, int offset, int length, Charset charset) { + Objects.requireNonNull(charset); + checkBoundsOffCount(offset, length, bytes.length); + if (charset == UTF_8) { + if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { + this.value = Arrays.copyOfRange(bytes, offset, offset + length); + this.coder = LATIN1; + } else { + int sl = offset + length; + int dp = 0; + byte[] dst = new byte[length]; + + if (COMPACT_STRINGS) { + while (offset < sl) { + int b1 = bytes[offset]; + if (b1 >= 0) { + dst[dp++] = (byte)b1; + offset++; + continue; + } + if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && + offset + 1 < sl) { + int b2 = bytes[offset + 1]; + if (!StringCoding.isNotContinuation(b2)) { + dst[dp++] = (byte)(((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); + offset += 2; + continue; + } + } + // anything not a latin1, including the repl + // we have to go with the utf16 + break; + } + if (offset == sl) { + if (dp != dst.length) { + dst = Arrays.copyOf(dst, dp); + } + this.value = dst; + this.coder = LATIN1; + return; + } + } + if (dp == 0) { + dst = new byte[length << 1]; + } else { + byte[] buf = new byte[length << 1]; + StringLatin1.inflate(dst, 0, buf, 0, dp); + dst = buf; + } + while (offset < sl) { + int b1 = bytes[offset++]; + if (b1 >= 0) { + putChar(dst, dp++, (char) b1); + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + if (offset < sl) { + int b2 = bytes[offset++]; + if (StringCoding.isNotContinuation(b2)) { + putChar(dst, dp++, REPL); + offset--; + } else { + putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0)))); + } + continue; + } + putChar(dst, dp++, REPL); + break; + } else if ((b1 >> 4) == -2) { + if (offset + 1 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + if (StringCoding.isMalformed3(b1, b2, b3)) { + putChar(dst, dp++, REPL); + offset -= 3; + offset += StringCoding.malformedN(bytes, offset, 3); + } else { + char c = (char)((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (isSurrogate(c)) { + putChar(dst, dp++, REPL); + } else { + putChar(dst, dp++, c); + } + } + continue; + } + if (offset < sl && StringCoding.isMalformed3_2(b1, bytes[offset])) { + putChar(dst, dp++, REPL); + continue; + } + putChar(dst, dp++, REPL); + break; + } else if ((b1 >> 3) == -2) { + if (offset + 2 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + int b4 = bytes[offset++]; + int uc = ((b1 << 18) ^ + (b2 << 12) ^ + (b3 << 6) ^ + (b4 ^ + (((byte) 0xF0 << 18) ^ + ((byte) 0x80 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (StringCoding.isMalformed4(b2, b3, b4) || + !isSupplementaryCodePoint(uc)) { // shortest form check + putChar(dst, dp++, REPL); + offset -= 4; + offset += StringCoding.malformedN(bytes, offset, 4); + } else { + putChar(dst, dp++, highSurrogate(uc)); + putChar(dst, dp++, lowSurrogate(uc)); + } + continue; + } + b1 &= 0xff; + if (b1 > 0xf4 || + offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { + putChar(dst, dp++, REPL); + continue; + } + offset++; + putChar(dst, dp++, REPL); + if (offset < sl && StringCoding.isMalformed4_3(bytes[offset])) { + continue; + } + break; + } else { + putChar(dst, dp++, REPL); + } + } + if (dp != length) { + dst = Arrays.copyOf(dst, dp << 1); + } + this.value = dst; + this.coder = UTF16; + } + } else if (charset == ISO_8859_1) { + if (COMPACT_STRINGS) { + this.value = Arrays.copyOfRange(bytes, offset, offset + length); + this.coder = LATIN1; + } else { + this.value = StringLatin1.inflate(bytes, offset, length); + this.coder = UTF16; + } + } else if (charset == US_ASCII) { + if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { + this.value = Arrays.copyOfRange(bytes, offset, offset + length); + this.coder = LATIN1; + } else { + byte[] dst = new byte[length << 1]; + int dp = 0; + while (dp < length) { + int b = bytes[offset++]; + putChar(dst, dp++, (b >= 0) ? (char) b : REPL); + } + this.value = dst; + this.coder = UTF16; + } + } else { + // (1)We never cache the "external" cs, the only benefit of creating + // an additional StringDe/Encoder object to wrap it is to share the + // de/encode() method. These SD/E objects are short-lived, the young-gen + // gc should be able to take care of them well. But the best approach + // is still not to generate them if not really necessary. + // (2)The defensive copy of the input byte/char[] has a big performance + // impact, as well as the outgoing result byte/char[]. Need to do the + // optimization check of (sm==null && classLoader0==null) for both. + // (3)There might be a timing gap in isTrusted setting. getClassLoader0() + // is only checked (and then isTrusted gets set) when (SM==null). It is + // possible that the SM==null for now but then SM is NOT null later + // when safeTrim() is invoked...the "safe" way to do is to redundant + // check (... && (isTrusted || SM == null || getClassLoader0())) in trim + // but it then can be argued that the SM is null when the operation + // is started... + CharsetDecoder cd = charset.newDecoder(); + // ascii fastpath + if ((cd instanceof ArrayDecoder) && + ((ArrayDecoder)cd).isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) { + if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { + this.value = Arrays.copyOfRange(bytes, offset, offset + length); + this.coder = LATIN1; + } else { + byte[] dst = new byte[length << 1]; + int dp = 0; + while (dp < length) { + int b = bytes[offset++]; + putChar(dst, dp++, (b >= 0) ? (char) b : REPL); + } + this.value = dst; + this.coder = UTF16; + } + return; + } + // fastpath for always Latin1 decodable single byte + if (COMPACT_STRINGS && cd instanceof ArrayDecoder && ((ArrayDecoder)cd).isLatin1Decodable()) { + byte[] dst = new byte[length]; + ((ArrayDecoder)cd).decodeToLatin1(bytes, offset, length, dst); + this.value = dst; + this.coder = LATIN1; + return; + } + + int en = StringCoding.scale(length, cd.maxCharsPerByte()); + if (length == 0) { + this.value = "".value; + this.coder = "".coder; + return; + } + cd.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE) + .reset(); + char[] ca = new char[en]; + if (cd instanceof ArrayDecoder) { + int clen = ((ArrayDecoder)cd).decode(bytes, offset, length, ca); + if (COMPACT_STRINGS) { + byte[] bs = StringUTF16.compress(ca, 0, clen); + if (bs != null) { + value = bs; + coder = LATIN1; + return; + } + } + coder = UTF16; + value = StringUTF16.toBytes(ca, 0, clen); + return; + } + if (charset.getClass().getClassLoader0() != null && + System.getSecurityManager() != null) { + bytes = Arrays.copyOfRange(bytes, offset, offset + length); + offset = 0; + } + ByteBuffer bb = ByteBuffer.wrap(bytes, offset, length); + CharBuffer cb = CharBuffer.wrap(ca); + try { + CoderResult cr = cd.decode(bb, cb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = cd.flush(cb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + // Substitution is always enabled, + // so this shouldn't happen + throw new Error(x); + } + if (COMPACT_STRINGS) { + byte[] bs = StringUTF16.compress(ca, 0, cb.position()); + if (bs != null) { + value = bs; + coder = LATIN1; + return; + } + } + coder = UTF16; + value = StringUTF16.toBytes(ca, 0, cb.position()); + } + } + + ////////////////////// for j.u.z.ZipCoder ////////////////////////// + + /* + * Throws iae, instead of replacing, if malformed or unmappable. + */ + static String newStringUTF8NoRepl(byte[] src, int off, int len) { + return new String(src, off, len, (Void)null); + } + + static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { + try { + return newStringNoRepl1(src, cs); + } catch (IllegalArgumentException e) { + //newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause + Throwable cause = e.getCause(); + if (cause instanceof MalformedInputException) { + throw (MalformedInputException)cause; + } + throw (CharacterCodingException)cause; + } + } + + static String newStringNoRepl1(byte[] src, Charset cs) { + if (cs == UTF_8) { + return newStringUTF8NoRepl(src, 0, src.length); + } + if (cs == ISO_8859_1) { + return new String(src, 0, src.length, ISO_8859_1); + } + if (cs == US_ASCII) { + if (!StringCoding.hasNegatives(src, 0, src.length)) { + return new String(src, 0, src.length, ISO_8859_1); + } else { + StringCoding.throwMalformed(src); + } + } + + CharsetDecoder cd = cs.newDecoder(); + // ascii fastpath + if ((cd instanceof ArrayDecoder) && + ((ArrayDecoder)cd).isASCIICompatible() && !StringCoding.hasNegatives(src, 0, src.length)) { + return new String(src, 0, src.length, ISO_8859_1); + } + int len = src.length; + if (len == 0) { + return ""; + } + int en = StringCoding.scale(len, cd.maxCharsPerByte()); + char[] ca = new char[en]; + if (cs.getClass().getClassLoader0() != null && + System.getSecurityManager() != null) { + src = Arrays.copyOf(src, len); + } + ByteBuffer bb = ByteBuffer.wrap(src); + CharBuffer cb = CharBuffer.wrap(ca); + try { + CoderResult cr = cd.decode(bb, cb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = cd.flush(cb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + throw new IllegalArgumentException(x); + } + StringCoding.Result ret = new StringCoding.Result().with(ca, 0, cb.position()); + return new String(ret.value, ret.coder); + } + + /* + * Private constructor for doing UTF-8 decode, but throwing iae on malformed or + * unmappable characters + */ + private String(byte[] bytes, int offset, int length, Void throwOnError) { checkBoundsOffCount(offset, length, bytes.length); - StringCoding.Result ret = - StringCoding.decode(charset, bytes, offset, length); - this.value = ret.value; - this.coder = ret.coder; + if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { + this.value = Arrays.copyOfRange(bytes, offset, offset + length); + this.coder = LATIN1; + } else { + int sl = offset + length; + int dp = 0; + byte[] dst = new byte[length]; + if (COMPACT_STRINGS) { + while (offset < sl) { + int b1 = bytes[offset]; + if (b1 >= 0) { + dst[dp++] = (byte)b1; + offset++; + continue; + } + if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && + offset + 1 < sl) { + int b2 = bytes[offset + 1]; + if (!StringCoding.isNotContinuation(b2)) { + dst[dp++] = (byte)(((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); + offset += 2; + continue; + } + } + // anything not a latin1, including the repl + // we have to go with the utf16 + break; + } + if (offset == sl) { + if (dp != dst.length) { + dst = Arrays.copyOf(dst, dp); + } + this.value = dst; + this.coder = LATIN1; + return; + } + } + if (dp == 0) { + dst = new byte[length << 1]; + } else { + byte[] buf = new byte[length << 1]; + StringLatin1.inflate(dst, 0, buf, 0, dp); + dst = buf; + } + while (offset < sl) { + int b1 = bytes[offset++]; + if (b1 >= 0) { + putChar(dst, dp++, (char) b1); + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + if (offset < sl) { + int b2 = bytes[offset++]; + if (StringCoding.isNotContinuation(b2)) { + StringCoding.throwMalformed(offset - 1, 1); + } else { + putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0)))); + } + continue; + } + StringCoding.throwMalformed(offset, 1); // underflow() + break; + } else if ((b1 >> 4) == -2) { + if (offset + 1 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + if (StringCoding.isMalformed3(b1, b2, b3)) { + StringCoding.throwMalformed(offset - 3, 3); + } else { + char c = (char)((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (isSurrogate(c)) { + StringCoding.throwMalformed(offset - 3, 3); + } else { + putChar(dst, dp++, c); + } + } + continue; + } + if (offset < sl && StringCoding.isMalformed3_2(b1, bytes[offset])) { + StringCoding.throwMalformed(offset - 1, 2); + continue; + } + StringCoding.throwMalformed(offset, 1); + break; + } else if ((b1 >> 3) == -2) { + if (offset + 2 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + int b4 = bytes[offset++]; + int uc = ((b1 << 18) ^ + (b2 << 12) ^ + (b3 << 6) ^ + (b4 ^ + (((byte) 0xF0 << 18) ^ + ((byte) 0x80 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (StringCoding.isMalformed4(b2, b3, b4) || + !isSupplementaryCodePoint(uc)) { // shortest form check + StringCoding.throwMalformed(offset - 4, 4); + } else { + putChar(dst, dp++, highSurrogate(uc)); + putChar(dst, dp++, lowSurrogate(uc)); + } + continue; + } + b1 &= 0xff; + if (b1 > 0xf4 || + offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { + StringCoding.throwMalformed(offset - 1, 1); // or 2 + continue; + } + StringCoding.throwMalformed(offset - 1, 1); + break; + } else { + StringCoding.throwMalformed(offset - 1, 1); + } + } + if (dp != length) { + dst = Arrays.copyOf(dst, dp << 1); + } + this.value = dst; + this.coder = UTF16; + } } /** @@ -605,10 +1076,7 @@ public String(byte bytes[], Charset charset) { * @since 1.1 */ public String(byte bytes[], int offset, int length) { - checkBoundsOffCount(offset, length, bytes.length); - StringCoding.Result ret = StringCoding.decode(bytes, offset, length); - this.value = ret.value; - this.coder = ret.coder; + this(bytes, offset, length, Charset.defaultCharset()); } /** diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 8c0911b169acf..051404bf6f3f7 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -63,8 +63,6 @@ class StringCoding { private StringCoding() { } /** The cached coders for each thread */ - private static final ThreadLocal> decoder = - new ThreadLocal<>(); private static final ThreadLocal> encoder = new ThreadLocal<>(); @@ -91,13 +89,13 @@ private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { return Arrays.copyOf(ba, len); } - private static int scale(int len, float expansionFactor) { + static int scale(int len, float expansionFactor) { // We need to perform double, not float, arithmetic; otherwise // we lose low order bits when len is larger than 2**24. return (int)(len * (double)expansionFactor); } - private static Charset lookupCharset(String csn) { + static Charset lookupCharset(String csn) { if (Charset.isSupported(csn)) { try { return Charset.forName(csn); @@ -112,12 +110,6 @@ static class Result { byte[] value; byte coder; - Result with() { - coder = COMPACT_STRINGS ? LATIN1 : UTF16; - value = new byte[0]; - return this; - } - Result with(char[] val, int off, int len) { if (String.COMPACT_STRINGS) { byte[] bs = StringUTF16.compress(val, off, len); @@ -149,201 +141,6 @@ public static boolean hasNegatives(byte[] ba, int off, int len) { return false; } - // -- Decoding -- - static class StringDecoder { - private final String requestedCharsetName; - private final Charset cs; - private final boolean isASCIICompatible; - private final CharsetDecoder cd; - protected final Result result; - - StringDecoder(Charset cs, String rcn) { - this.requestedCharsetName = rcn; - this.cs = cs; - this.cd = cs.newDecoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - this.result = new Result(); - this.isASCIICompatible = (cd instanceof ArrayDecoder) && - ((ArrayDecoder)cd).isASCIICompatible(); - } - - String charsetName() { - if (cs instanceof HistoricallyNamedCharset) - return ((HistoricallyNamedCharset)cs).historicalName(); - return cs.name(); - } - - final String requestedCharsetName() { - return requestedCharsetName; - } - - Result decode(byte[] ba, int off, int len) { - if (len == 0) { - return result.with(); - } - // fastpath for ascii compatible - if (isASCIICompatible && !hasNegatives(ba, off, len)) { - if (COMPACT_STRINGS) { - return result.with(Arrays.copyOfRange(ba, off, off + len), - LATIN1); - } else { - return result.with(StringLatin1.inflate(ba, off, len), UTF16); - } - } - // fastpath for always Latin1 decodable single byte - if (COMPACT_STRINGS && cd instanceof ArrayDecoder && ((ArrayDecoder)cd).isLatin1Decodable()) { - byte[] dst = new byte[len]; - ((ArrayDecoder)cd).decodeToLatin1(ba, off, len, dst); - return result.with(dst, LATIN1); - } - int en = scale(len, cd.maxCharsPerByte()); - char[] ca = new char[en]; - if (cd instanceof ArrayDecoder) { - int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); - return result.with(ca, 0, clen); - } - cd.reset(); - ByteBuffer bb = ByteBuffer.wrap(ba, off, len); - CharBuffer cb = CharBuffer.wrap(ca); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } - return result.with(ca, 0, cb.position()); - } - } - - static Result decode(String charsetName, byte[] ba, int off, int len) - throws UnsupportedEncodingException - { - StringDecoder sd = deref(decoder); - String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; - if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) - || csn.equals(sd.charsetName()))) { - sd = null; - try { - Charset cs = lookupCharset(csn); - if (cs != null) { - if (cs == UTF_8) { - return decodeUTF8(ba, off, len, true); - } - if (cs == ISO_8859_1) { - return decodeLatin1(ba, off, len); - } - if (cs == US_ASCII) { - return decodeASCII(ba, off, len); - } - sd = new StringDecoder(cs, csn); - } - } catch (IllegalCharsetNameException x) {} - if (sd == null) - throw new UnsupportedEncodingException(csn); - set(decoder, sd); - } - return sd.decode(ba, off, len); - } - - static Result decode(Charset cs, byte[] ba, int off, int len) { - if (cs == UTF_8) { - return decodeUTF8(ba, off, len, true); - } - if (cs == ISO_8859_1) { - return decodeLatin1(ba, off, len); - } - if (cs == US_ASCII) { - return decodeASCII(ba, off, len); - } - - // (1)We never cache the "external" cs, the only benefit of creating - // an additional StringDe/Encoder object to wrap it is to share the - // de/encode() method. These SD/E objects are short-lived, the young-gen - // gc should be able to take care of them well. But the best approach - // is still not to generate them if not really necessary. - // (2)The defensive copy of the input byte/char[] has a big performance - // impact, as well as the outgoing result byte/char[]. Need to do the - // optimization check of (sm==null && classLoader0==null) for both. - // (3)There might be a timing gap in isTrusted setting. getClassLoader0() - // is only checked (and then isTrusted gets set) when (SM==null). It is - // possible that the SM==null for now but then SM is NOT null later - // when safeTrim() is invoked...the "safe" way to do is to redundant - // check (... && (isTrusted || SM == null || getClassLoader0())) in trim - // but it then can be argued that the SM is null when the operation - // is started... - CharsetDecoder cd = cs.newDecoder(); - // ascii fastpath - if ((cd instanceof ArrayDecoder) && - ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) { - return decodeLatin1(ba, off, len); - } - // fastpath for always Latin1 decodable single byte - if (COMPACT_STRINGS && cd instanceof ArrayDecoder && ((ArrayDecoder)cd).isLatin1Decodable()) { - byte[] dst = new byte[len]; - ((ArrayDecoder)cd).decodeToLatin1(ba, off, len, dst); - return new Result().with(dst, LATIN1); - } - - int en = scale(len, cd.maxCharsPerByte()); - if (len == 0) { - return new Result().with(); - } - cd.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE) - .reset(); - char[] ca = new char[en]; - if (cd instanceof ArrayDecoder) { - int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); - return new Result().with(ca, 0, clen); - } - if (cs.getClass().getClassLoader0() != null && - System.getSecurityManager() != null) { - ba = Arrays.copyOfRange(ba, off, off + len); - off = 0; - } - ByteBuffer bb = ByteBuffer.wrap(ba, off, len); - CharBuffer cb = CharBuffer.wrap(ca); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } - return new Result().with(ca, 0, cb.position()); - } - - static Result decode(byte[] ba, int off, int len) { - Charset cs = Charset.defaultCharset(); - if (cs == UTF_8) { - return decodeUTF8(ba, off, len, true); - } - if (cs == ISO_8859_1) { - return decodeLatin1(ba, off, len); - } - if (cs == US_ASCII) { - return decodeASCII(ba, off, len); - } - StringDecoder sd = deref(decoder); - if (sd == null || !cs.name().equals(sd.cs.name())) { - sd = new StringDecoder(cs, cs.name()); - set(decoder, sd); - } - return sd.decode(ba, off, len); - } - // -- Encoding -- private static class StringEncoder { private Charset cs; @@ -522,30 +319,8 @@ static byte[] encode(byte coder, byte[] val) { */ private static native void err(String msg); - /* The cached Result for each thread */ - private static final ThreadLocal - resultCached = new ThreadLocal<>() { - protected StringCoding.Result initialValue() { - return new StringCoding.Result(); - }}; - ////////////////////////// ascii ////////////////////////////// - private static Result decodeASCII(byte[] ba, int off, int len) { - Result result = resultCached.get(); - if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) { - return result.with(Arrays.copyOfRange(ba, off, off + len), - LATIN1); - } - byte[] dst = new byte[len<<1]; - int dp = 0; - while (dp < len) { - int b = ba[off++]; - putChar(dst, dp++, (b >= 0) ? (char)b : repl); - } - return result.with(dst, UTF16); - } - private static byte[] encodeASCII(byte coder, byte[] val) { if (coder == LATIN1) { byte[] dst = new byte[val.length]; @@ -579,17 +354,6 @@ private static byte[] encodeASCII(byte coder, byte[] val) { return Arrays.copyOf(dst, dp); } - ////////////////////////// latin1/8859_1 /////////////////////////// - - private static Result decodeLatin1(byte[] ba, int off, int len) { - Result result = resultCached.get(); - if (COMPACT_STRINGS) { - return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); - } else { - return result.with(StringLatin1.inflate(ba, off, len), UTF16); - } - } - @IntrinsicCandidate private static int implEncodeISOArray(byte[] sa, int sp, byte[] da, int dp, int len) { @@ -641,37 +405,37 @@ private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { //////////////////////////////// utf8 //////////////////////////////////// - private static boolean isNotContinuation(int b) { + static boolean isNotContinuation(int b) { return (b & 0xc0) != 0x80; } - private static boolean isMalformed3(int b1, int b2, int b3) { + static boolean isMalformed3(int b1, int b2, int b3) { return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; } - private static boolean isMalformed3_2(int b1, int b2) { + static boolean isMalformed3_2(int b1, int b2) { return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80; } - private static boolean isMalformed4(int b2, int b3, int b4) { + static boolean isMalformed4(int b2, int b3, int b4) { return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80; } - private static boolean isMalformed4_2(int b1, int b2) { + static boolean isMalformed4_2(int b1, int b2) { return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || (b2 & 0xc0) != 0x80; } - private static boolean isMalformed4_3(int b3) { + static boolean isMalformed4_3(int b3) { return (b3 & 0xc0) != 0x80; } // for nb == 3/4 - private static int malformedN(byte[] src, int sp, int nb) { + static int malformedN(byte[] src, int sp, int nb) { if (nb == 3) { int b1 = src[sp++]; int b2 = src[sp++]; // no need to lookup b3 @@ -693,206 +457,28 @@ private static int malformedN(byte[] src, int sp, int nb) { return -1; } - private static void throwMalformed(int off, int nb) { + static void throwMalformed(int off, int nb) { String msg = "malformed input off : " + off + ", length : " + nb; throw new IllegalArgumentException(msg, new MalformedInputException(nb)); } - private static void throwMalformed(byte[] val) { + static void throwMalformed(byte[] val) { int dp = 0; while (dp < val.length && val[dp] >=0) { dp++; } throwMalformed(dp, 1); } - private static void throwUnmappable(int off, int nb) { + static void throwUnmappable(int off, int nb) { String msg = "malformed input off : " + off + ", length : " + nb; throw new IllegalArgumentException(msg, new UnmappableCharacterException(nb)); } - private static void throwUnmappable(byte[] val) { + static void throwUnmappable(byte[] val) { int dp = 0; while (dp < val.length && val[dp] >=0) { dp++; } throwUnmappable(dp, 1); } - private static char repl = '\ufffd'; - - private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) { - // ascii-bais, which has a relative impact to the non-ascii-only bytes - if (COMPACT_STRINGS && !hasNegatives(src, sp, len)) - return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len), - LATIN1); - return decodeUTF8_0(src, sp, len, doReplace); - } - - private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { - Result ret = resultCached.get(); - - int sl = sp + len; - int dp = 0; - byte[] dst = new byte[len]; - - if (COMPACT_STRINGS) { - while (sp < sl) { - int b1 = src[sp]; - if (b1 >= 0) { - dst[dp++] = (byte)b1; - sp++; - continue; - } - if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && - sp + 1 < sl) { - int b2 = src[sp + 1]; - if (!isNotContinuation(b2)) { - dst[dp++] = (byte)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0))); - sp += 2; - continue; - } - } - // anything not a latin1, including the repl - // we have to go with the utf16 - break; - } - if (sp == sl) { - if (dp != dst.length) { - dst = Arrays.copyOf(dst, dp); - } - return ret.with(dst, LATIN1); - } - } - if (dp == 0) { - dst = new byte[len << 1]; - } else { - byte[] buf = new byte[len << 1]; - StringLatin1.inflate(dst, 0, buf, 0, dp); - dst = buf; - } - while (sp < sl) { - int b1 = src[sp++]; - if (b1 >= 0) { - putChar(dst, dp++, (char) b1); - } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { - if (sp < sl) { - int b2 = src[sp++]; - if (isNotContinuation(b2)) { - if (!doReplace) { - throwMalformed(sp - 1, 1); - } - putChar(dst, dp++, repl); - sp--; - } else { - putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0)))); - } - continue; - } - if (!doReplace) { - throwMalformed(sp, 1); // underflow() - } - putChar(dst, dp++, repl); - break; - } else if ((b1 >> 4) == -2) { - if (sp + 1 < sl) { - int b2 = src[sp++]; - int b3 = src[sp++]; - if (isMalformed3(b1, b2, b3)) { - if (!doReplace) { - throwMalformed(sp - 3, 3); - } - putChar(dst, dp++, repl); - sp -= 3; - sp += malformedN(src, sp, 3); - } else { - char c = (char)((b1 << 12) ^ - (b2 << 6) ^ - (b3 ^ - (((byte) 0xE0 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (isSurrogate(c)) { - if (!doReplace) { - throwMalformed(sp - 3, 3); - } - putChar(dst, dp++, repl); - } else { - putChar(dst, dp++, c); - } - } - continue; - } - if (sp < sl && isMalformed3_2(b1, src[sp])) { - if (!doReplace) { - throwMalformed(sp - 1, 2); - } - putChar(dst, dp++, repl); - continue; - } - if (!doReplace){ - throwMalformed(sp, 1); - } - putChar(dst, dp++, repl); - break; - } else if ((b1 >> 3) == -2) { - if (sp + 2 < sl) { - int b2 = src[sp++]; - int b3 = src[sp++]; - int b4 = src[sp++]; - int uc = ((b1 << 18) ^ - (b2 << 12) ^ - (b3 << 6) ^ - (b4 ^ - (((byte) 0xF0 << 18) ^ - ((byte) 0x80 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (isMalformed4(b2, b3, b4) || - !isSupplementaryCodePoint(uc)) { // shortest form check - if (!doReplace) { - throwMalformed(sp - 4, 4); - } - putChar(dst, dp++, repl); - sp -= 4; - sp += malformedN(src, sp, 4); - } else { - putChar(dst, dp++, highSurrogate(uc)); - putChar(dst, dp++, lowSurrogate(uc)); - } - continue; - } - b1 &= 0xff; - if (b1 > 0xf4 || - sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { - if (!doReplace) { - throwMalformed(sp - 1, 1); // or 2 - } - putChar(dst, dp++, repl); - continue; - } - if (!doReplace) { - throwMalformed(sp - 1, 1); - } - sp++; - putChar(dst, dp++, repl); - if (sp < sl && isMalformed4_3(src[sp])) { - continue; - } - break; - } else { - if (!doReplace) { - throwMalformed(sp - 1, 1); - } - putChar(dst, dp++, repl); - } - } - if (dp != len) { - dst = Arrays.copyOf(dst, dp << 1); - } - return ret.with(dst, UTF16); - } - private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { if (coder == UTF16) return encodeUTF8_UTF16(val, doReplace); @@ -967,18 +553,6 @@ private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { return Arrays.copyOf(dst, dp); } - ////////////////////// for j.u.z.ZipCoder ////////////////////////// - - /* - * Throws iae, instead of replacing, if malformed or unmappable. - */ - static String newStringUTF8NoRepl(byte[] src, int off, int len) { - if (COMPACT_STRINGS && !hasNegatives(src, off, len)) - return new String(Arrays.copyOfRange(src, off, off + len), LATIN1); - Result ret = decodeUTF8_0(src, off, len, false); - return new String(ret.value, ret.coder); - } - /* * Throws iae, instead of replacing, if unmappable. */ @@ -992,75 +566,6 @@ private static boolean isASCII(byte[] src) { return !hasNegatives(src, 0, src.length); } - private static String newStringLatin1(byte[] src) { - if (COMPACT_STRINGS) - return new String(src, LATIN1); - return new String(StringLatin1.inflate(src, 0, src.length), UTF16); - } - - static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { - try { - return newStringNoRepl1(src, cs); - } catch (IllegalArgumentException e) { - //newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause - Throwable cause = e.getCause(); - if (cause instanceof MalformedInputException) { - throw (MalformedInputException)cause; - } - throw (CharacterCodingException)cause; - } - } - - static String newStringNoRepl1(byte[] src, Charset cs) { - if (cs == UTF_8) { - if (COMPACT_STRINGS && isASCII(src)) - return new String(src, LATIN1); - Result ret = decodeUTF8_0(src, 0, src.length, false); - return new String(ret.value, ret.coder); - } - if (cs == ISO_8859_1) { - return newStringLatin1(src); - } - if (cs == US_ASCII) { - if (isASCII(src)) { - return newStringLatin1(src); - } else { - throwMalformed(src); - } - } - - CharsetDecoder cd = cs.newDecoder(); - // ascii fastpath - if ((cd instanceof ArrayDecoder) && - ((ArrayDecoder)cd).isASCIICompatible() && isASCII(src)) { - return newStringLatin1(src); - } - int len = src.length; - if (len == 0) { - return ""; - } - int en = scale(len, cd.maxCharsPerByte()); - char[] ca = new char[en]; - if (cs.getClass().getClassLoader0() != null && - System.getSecurityManager() != null) { - src = Arrays.copyOf(src, len); - } - ByteBuffer bb = ByteBuffer.wrap(src); - CharBuffer cb = CharBuffer.wrap(ca); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - throw new IllegalArgumentException(x); // todo - } - Result ret = resultCached.get().with(ca, 0, cb.position()); - return new String(ret.value, ret.coder); - } - /* * Throws CCE, instead of replacing, if unmappable. */ diff --git a/src/java.base/share/classes/java/lang/System.java b/src/java.base/share/classes/java/lang/System.java index 7d94e041cc3e1..91fbadc4d1bfa 100644 --- a/src/java.base/share/classes/java/lang/System.java +++ b/src/java.base/share/classes/java/lang/System.java @@ -2262,7 +2262,7 @@ public Stream layers(ClassLoader loader) { } public String newStringNoRepl(byte[] bytes, Charset cs) throws CharacterCodingException { - return StringCoding.newStringNoRepl(bytes, cs); + return String.newStringNoRepl(bytes, cs); } public byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { @@ -2270,7 +2270,7 @@ public byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingExcepti } public String newStringUTF8NoRepl(byte[] bytes, int off, int len) { - return StringCoding.newStringUTF8NoRepl(bytes, off, len); + return String.newStringUTF8NoRepl(bytes, off, len); } public byte[] getBytesUTF8NoRepl(String s) { From 67067dbcb4598a3d01ecf609b39ddf1d4af7e3d6 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 14 Jan 2021 17:51:49 +0100 Subject: [PATCH 02/25] Cleanups and small fixes --- .../share/classes/java/lang/String.java | 260 +++++++++--------- 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index ff95f419d8153..980cae12bae9a 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -529,6 +529,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { this.value = Arrays.copyOfRange(bytes, offset, offset + length); this.coder = LATIN1; + return; } else { int sl = offset + length; int dp = 0; @@ -709,19 +710,19 @@ public String(byte[] bytes, int offset, int length, Charset charset) { // ascii fastpath if ((cd instanceof ArrayDecoder) && ((ArrayDecoder)cd).isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) { - if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { + if (COMPACT_STRINGS) { this.value = Arrays.copyOfRange(bytes, offset, offset + length); this.coder = LATIN1; - } else { - byte[] dst = new byte[length << 1]; - int dp = 0; - while (dp < length) { - int b = bytes[offset++]; - putChar(dst, dp++, (b >= 0) ? (char) b : REPL); - } - this.value = dst; - this.coder = UTF16; + return; + } + byte[] dst = new byte[length << 1]; + int dp = 0; + while (dp < length) { + int b = bytes[offset++]; + putChar(dst, dp++, (b >= 0) ? (char) b : REPL); } + this.value = dst; + this.coder = UTF16; return; } // fastpath for always Latin1 decodable single byte @@ -733,12 +734,12 @@ public String(byte[] bytes, int offset, int length, Charset charset) { return; } - int en = StringCoding.scale(length, cd.maxCharsPerByte()); if (length == 0) { this.value = "".value; this.coder = "".coder; return; } + int en = StringCoding.scale(length, cd.maxCharsPerByte()); cd.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); @@ -816,11 +817,15 @@ static String newStringNoRepl1(byte[] src, Charset cs) { return newStringUTF8NoRepl(src, 0, src.length); } if (cs == ISO_8859_1) { - return new String(src, 0, src.length, ISO_8859_1); + if (COMPACT_STRINGS) + return new String(src, LATIN1); + return new String(StringLatin1.inflate(src, 0, src.length), UTF16); } if (cs == US_ASCII) { if (!StringCoding.hasNegatives(src, 0, src.length)) { - return new String(src, 0, src.length, ISO_8859_1); + if (COMPACT_STRINGS) + return new String(src, LATIN1); + return new String(StringLatin1.inflate(src, 0, src.length), UTF16); } else { StringCoding.throwMalformed(src); } @@ -864,137 +869,132 @@ static String newStringNoRepl1(byte[] src, Charset cs) { */ private String(byte[] bytes, int offset, int length, Void throwOnError) { checkBoundsOffCount(offset, length, bytes.length); - if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { - this.value = Arrays.copyOfRange(bytes, offset, offset + length); - this.coder = LATIN1; - } else { - int sl = offset + length; - int dp = 0; - byte[] dst = new byte[length]; - if (COMPACT_STRINGS) { - while (offset < sl) { - int b1 = bytes[offset]; - if (b1 >= 0) { - dst[dp++] = (byte)b1; - offset++; - continue; - } - if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && - offset + 1 < sl) { - int b2 = bytes[offset + 1]; - if (!StringCoding.isNotContinuation(b2)) { - dst[dp++] = (byte)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0))); - offset += 2; - continue; - } - } - // anything not a latin1, including the repl - // we have to go with the utf16 - break; + int sl = offset + length; + int dp = 0; + byte[] dst = new byte[length]; + if (COMPACT_STRINGS) { + while (offset < sl) { + int b1 = bytes[offset]; + if (b1 >= 0) { + dst[dp++] = (byte)b1; + offset++; + continue; } - if (offset == sl) { - if (dp != dst.length) { - dst = Arrays.copyOf(dst, dp); + if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && + offset + 1 < sl) { + int b2 = bytes[offset + 1]; + if (!StringCoding.isNotContinuation(b2)) { + dst[dp++] = (byte)(((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); + offset += 2; + continue; } - this.value = dst; - this.coder = LATIN1; - return; } + // anything not a latin1, including the repl + // we have to go with the utf16 + break; } - if (dp == 0) { - dst = new byte[length << 1]; - } else { - byte[] buf = new byte[length << 1]; - StringLatin1.inflate(dst, 0, buf, 0, dp); - dst = buf; + if (offset == sl) { + if (dp != dst.length) { + dst = Arrays.copyOf(dst, dp); + } + this.value = dst; + this.coder = LATIN1; + return; } - while (offset < sl) { - int b1 = bytes[offset++]; - if (b1 >= 0) { - putChar(dst, dp++, (char) b1); - } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { - if (offset < sl) { - int b2 = bytes[offset++]; - if (StringCoding.isNotContinuation(b2)) { - StringCoding.throwMalformed(offset - 1, 1); - } else { - putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0)))); - } - continue; + } + if (dp == 0) { + dst = new byte[length << 1]; + } else { + byte[] buf = new byte[length << 1]; + StringLatin1.inflate(dst, 0, buf, 0, dp); + dst = buf; + } + while (offset < sl) { + int b1 = bytes[offset++]; + if (b1 >= 0) { + putChar(dst, dp++, (char) b1); + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + if (offset < sl) { + int b2 = bytes[offset++]; + if (StringCoding.isNotContinuation(b2)) { + StringCoding.throwMalformed(offset - 1, 1); + } else { + putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0)))); } - StringCoding.throwMalformed(offset, 1); // underflow() - break; - } else if ((b1 >> 4) == -2) { - if (offset + 1 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; - if (StringCoding.isMalformed3(b1, b2, b3)) { + continue; + } + StringCoding.throwMalformed(offset, 1); // underflow() + break; + } else if ((b1 >> 4) == -2) { + if (offset + 1 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + if (StringCoding.isMalformed3(b1, b2, b3)) { + StringCoding.throwMalformed(offset - 3, 3); + } else { + char c = (char)((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (isSurrogate(c)) { StringCoding.throwMalformed(offset - 3, 3); } else { - char c = (char)((b1 << 12) ^ - (b2 << 6) ^ - (b3 ^ - (((byte) 0xE0 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (isSurrogate(c)) { - StringCoding.throwMalformed(offset - 3, 3); - } else { - putChar(dst, dp++, c); - } + putChar(dst, dp++, c); } - continue; } - if (offset < sl && StringCoding.isMalformed3_2(b1, bytes[offset])) { - StringCoding.throwMalformed(offset - 1, 2); - continue; - } - StringCoding.throwMalformed(offset, 1); - break; - } else if ((b1 >> 3) == -2) { - if (offset + 2 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; - int b4 = bytes[offset++]; - int uc = ((b1 << 18) ^ - (b2 << 12) ^ - (b3 << 6) ^ - (b4 ^ - (((byte) 0xF0 << 18) ^ - ((byte) 0x80 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (StringCoding.isMalformed4(b2, b3, b4) || - !isSupplementaryCodePoint(uc)) { // shortest form check - StringCoding.throwMalformed(offset - 4, 4); - } else { - putChar(dst, dp++, highSurrogate(uc)); - putChar(dst, dp++, lowSurrogate(uc)); - } - continue; - } - b1 &= 0xff; - if (b1 > 0xf4 || - offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { - StringCoding.throwMalformed(offset - 1, 1); // or 2 - continue; + continue; + } + if (offset < sl && StringCoding.isMalformed3_2(b1, bytes[offset])) { + StringCoding.throwMalformed(offset - 1, 2); + continue; + } + StringCoding.throwMalformed(offset, 1); + break; + } else if ((b1 >> 3) == -2) { + if (offset + 2 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + int b4 = bytes[offset++]; + int uc = ((b1 << 18) ^ + (b2 << 12) ^ + (b3 << 6) ^ + (b4 ^ + (((byte) 0xF0 << 18) ^ + ((byte) 0x80 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (StringCoding.isMalformed4(b2, b3, b4) || + !isSupplementaryCodePoint(uc)) { // shortest form check + StringCoding.throwMalformed(offset - 4, 4); + } else { + putChar(dst, dp++, highSurrogate(uc)); + putChar(dst, dp++, lowSurrogate(uc)); } - StringCoding.throwMalformed(offset - 1, 1); - break; - } else { - StringCoding.throwMalformed(offset - 1, 1); + continue; } + b1 &= 0xff; + if (b1 > 0xf4 || + offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { + StringCoding.throwMalformed(offset - 1, 1); // or 2 + continue; + } + StringCoding.throwMalformed(offset - 1, 1); + break; + } else { + StringCoding.throwMalformed(offset - 1, 1); } - if (dp != length) { - dst = Arrays.copyOf(dst, dp << 1); - } - this.value = dst; - this.coder = UTF16; } + if (dp != length) { + dst = Arrays.copyOf(dst, dp << 1); + } + this.value = dst; + this.coder = UTF16; } /** From 6335b3bb0d7d8b63debbf06f21b7c90d867731ec Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 14 Jan 2021 18:34:09 +0100 Subject: [PATCH 03/25] Break bootstrap order issues by statically importing Charsets from StringCoding rather than eagerly initialize on String clinit --- src/java.base/share/classes/java/lang/String.java | 7 +++---- src/java.base/share/classes/java/lang/StringCoding.java | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 980cae12bae9a..06f6cc57a5652 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -57,6 +57,9 @@ import static java.lang.Character.*; import static java.lang.Character.lowSurrogate; +import static java.lang.StringCoding.ISO_8859_1; +import static java.lang.StringCoding.US_ASCII; +import static java.lang.StringCoding.UTF_8; import static java.lang.StringUTF16.putChar; import static java.util.function.Predicate.not; @@ -223,10 +226,6 @@ public final class String COMPACT_STRINGS = true; } - private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; - private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; - private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; - private static final char REPL = '\ufffd'; /** diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 051404bf6f3f7..827d46ae22848 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -66,9 +66,9 @@ private StringCoding() { } private static final ThreadLocal> encoder = new ThreadLocal<>(); - private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; - private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; - private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; + static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; + static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; + static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; private static T deref(ThreadLocal> tl) { SoftReference sr = tl.get(); From a99079a8071d1236265ec34d21d65b5e93e3cfdf Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 14 Jan 2021 18:51:49 +0100 Subject: [PATCH 04/25] Add simple StringDecode micro --- .../openjdk/bench/java/lang/StringDecode.java | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 test/micro/org/openjdk/bench/java/lang/StringDecode.java diff --git a/test/micro/org/openjdk/bench/java/lang/StringDecode.java b/test/micro/org/openjdk/bench/java/lang/StringDecode.java new file mode 100644 index 0000000000000..d9fa17c14e733 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/lang/StringDecode.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.lang; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; + +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(value = 3) +@Warmup(iterations = 5, time = 1) +@Measurement(iterations = 5, time = 2) +@State(Scope.Thread) +public class StringDecode { + + @Param({"US-ASCII", "ISO_8859_1", "UTF-8", "UTF_16"}) + private String charsetName; + + private Charset charset; + + private byte[] asciiString; + private byte[] utf16String; + + private byte[] asciiDefaultString; + private byte[] utf16DefaultString; + @Setup + public void setup() { + charset = Charset.forName(charsetName); + asciiString = "ascii string".getBytes(charset); + utf16String = "UTF-\uFF11\uFF16 string".getBytes(charset); + + asciiDefaultString = "ascii string".getBytes(); + utf16DefaultString = "UTF-\uFF11\uFF16 string".getBytes(); + } + + @Benchmark + public String decodeCharsetName(Blackhole bh) throws Exception { + bh.consume(new String(asciiString, charsetName)); + bh.consume(new String(utf16String, charsetName)); + } + + @Benchmark + public String decodeCharset(Blackhole bh) throws Exception { + bh.consume(new String(asciiString, charset)); + bh.consume(new String(utf16String, charset)); + } + + @Benchmark + public String decodeDefault(Blackhole bh) throws Exception { + bh.consume(new String(asciiDefaultString, charset)); + bh.consume(new String(utf16DefaultString, charset)); + } +} From 1a9797c57a0573931a01c346d0abd39f004f3a16 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 14 Jan 2021 18:55:33 +0100 Subject: [PATCH 05/25] Fix micro --- .../org/openjdk/bench/java/lang/StringDecode.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/micro/org/openjdk/bench/java/lang/StringDecode.java b/test/micro/org/openjdk/bench/java/lang/StringDecode.java index d9fa17c14e733..01dcfc3e09958 100644 --- a/test/micro/org/openjdk/bench/java/lang/StringDecode.java +++ b/test/micro/org/openjdk/bench/java/lang/StringDecode.java @@ -24,12 +24,16 @@ import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; import java.util.concurrent.TimeUnit; @@ -62,19 +66,19 @@ public void setup() { } @Benchmark - public String decodeCharsetName(Blackhole bh) throws Exception { + public void decodeCharsetName(Blackhole bh) throws Exception { bh.consume(new String(asciiString, charsetName)); bh.consume(new String(utf16String, charsetName)); } @Benchmark - public String decodeCharset(Blackhole bh) throws Exception { + public void decodeCharset(Blackhole bh) throws Exception { bh.consume(new String(asciiString, charset)); bh.consume(new String(utf16String, charset)); } @Benchmark - public String decodeDefault(Blackhole bh) throws Exception { + public void decodeDefault(Blackhole bh) throws Exception { bh.consume(new String(asciiDefaultString, charset)); bh.consume(new String(utf16DefaultString, charset)); } From f3b7f74e1e7e5af5f9d28ab8ebdfceb5ba2d8599 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 14 Jan 2021 19:18:07 +0100 Subject: [PATCH 06/25] More micro fixes --- test/micro/org/openjdk/bench/java/lang/StringDecode.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/micro/org/openjdk/bench/java/lang/StringDecode.java b/test/micro/org/openjdk/bench/java/lang/StringDecode.java index 01dcfc3e09958..fc048272ba66c 100644 --- a/test/micro/org/openjdk/bench/java/lang/StringDecode.java +++ b/test/micro/org/openjdk/bench/java/lang/StringDecode.java @@ -35,6 +35,7 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; +import java.nio.charset.Charset; import java.util.concurrent.TimeUnit; @BenchmarkMode(Mode.AverageTime) @@ -55,6 +56,7 @@ public class StringDecode { private byte[] asciiDefaultString; private byte[] utf16DefaultString; + @Setup public void setup() { charset = Charset.forName(charsetName); From 14b142bc12b4622c53d3768d1b7dc2dc573c4c4e Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 15 Jan 2021 01:44:16 +0100 Subject: [PATCH 07/25] Cleanups, minor improvements --- .../share/classes/java/lang/String.java | 17 +++-- .../share/classes/java/lang/StringCoding.java | 25 -------- .../openjdk/bench/java/lang/StringDecode.java | 63 +++++++++++-------- 3 files changed, 45 insertions(+), 60 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 06f6cc57a5652..eb962b1dcf7fa 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -714,13 +714,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { this.coder = LATIN1; return; } - byte[] dst = new byte[length << 1]; - int dp = 0; - while (dp < length) { - int b = bytes[offset++]; - putChar(dst, dp++, (b >= 0) ? (char) b : REPL); - } - this.value = dst; + this.value = StringLatin1.inflate(bytes, offset, length); this.coder = UTF16; return; } @@ -858,8 +852,13 @@ static String newStringNoRepl1(byte[] src, Charset cs) { } catch (CharacterCodingException x) { throw new IllegalArgumentException(x); } - StringCoding.Result ret = new StringCoding.Result().with(ca, 0, cb.position()); - return new String(ret.value, ret.coder); + if (COMPACT_STRINGS) { + byte[] bs = StringUTF16.compress(ca, 0, cb.position()); + if (bs != null) { + return new String(bs, LATIN1); + } + } + return new String(StringUTF16.toBytes(ca, 0, cb.position()), UTF16); } /* diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 67559a1958ad1..0661ea155d116 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -106,31 +106,6 @@ static Charset lookupCharset(String csn) { return null; } - static class Result { - byte[] value; - byte coder; - - Result with(char[] val, int off, int len) { - if (String.COMPACT_STRINGS) { - byte[] bs = StringUTF16.compress(val, off, len); - if (bs != null) { - value = bs; - coder = LATIN1; - return this; - } - } - coder = UTF16; - value = StringUTF16.toBytes(val, off, len); - return this; - } - - Result with(byte[] val, byte coder) { - this.coder = coder; - value = val; - return this; - } - } - @IntrinsicCandidate public static boolean hasNegatives(byte[] ba, int off, int len) { for (int i = off; i < off + len; i++) { diff --git a/test/micro/org/openjdk/bench/java/lang/StringDecode.java b/test/micro/org/openjdk/bench/java/lang/StringDecode.java index fc048272ba66c..26235131aec3d 100644 --- a/test/micro/org/openjdk/bench/java/lang/StringDecode.java +++ b/test/micro/org/openjdk/bench/java/lang/StringDecode.java @@ -40,48 +40,59 @@ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.NANOSECONDS) -@Fork(value = 3) -@Warmup(iterations = 5, time = 1) -@Measurement(iterations = 5, time = 2) +@Fork(value = 3, jvmArgs = "-Xmx1g") +@Warmup(iterations = 5, time = 2) +@Measurement(iterations = 5, time = 3) @State(Scope.Thread) public class StringDecode { - @Param({"US-ASCII", "ISO_8859_1", "UTF-8", "UTF_16"}) - private String charsetName; + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Fork(value = 3, jvmArgs = "-Xmx1g") + @Warmup(iterations = 5, time = 2) + @Measurement(iterations = 5, time = 2) + @State(Scope.Thread) + public static class WithCharset { - private Charset charset; + @Param({"US-ASCII", "ISO_8859_1", "UTF-8", "UTF_16"}) + private String charsetName; - private byte[] asciiString; - private byte[] utf16String; + private Charset charset; + private byte[] asciiString; + private byte[] utf16String; + + @Setup + public void setup() { + charset = Charset.forName(charsetName); + asciiString = "ascii string".getBytes(charset); + utf16String = "UTF-\uFF11\uFF16 string".getBytes(charset); + } + + @Benchmark + public void decodeCharsetName(Blackhole bh) throws Exception { + bh.consume(new String(asciiString, charsetName)); + bh.consume(new String(utf16String, charsetName)); + } + + @Benchmark + public void decodeCharset(Blackhole bh) throws Exception { + bh.consume(new String(asciiString, charset)); + bh.consume(new String(utf16String, charset)); + } + } private byte[] asciiDefaultString; private byte[] utf16DefaultString; @Setup public void setup() { - charset = Charset.forName(charsetName); - asciiString = "ascii string".getBytes(charset); - utf16String = "UTF-\uFF11\uFF16 string".getBytes(charset); - asciiDefaultString = "ascii string".getBytes(); utf16DefaultString = "UTF-\uFF11\uFF16 string".getBytes(); } - @Benchmark - public void decodeCharsetName(Blackhole bh) throws Exception { - bh.consume(new String(asciiString, charsetName)); - bh.consume(new String(utf16String, charsetName)); - } - - @Benchmark - public void decodeCharset(Blackhole bh) throws Exception { - bh.consume(new String(asciiString, charset)); - bh.consume(new String(utf16String, charset)); - } - @Benchmark public void decodeDefault(Blackhole bh) throws Exception { - bh.consume(new String(asciiDefaultString, charset)); - bh.consume(new String(utf16DefaultString, charset)); + bh.consume(new String(asciiDefaultString)); + bh.consume(new String(utf16DefaultString)); } } From d336ac432b13d18ceedb0a4f7a9a0adcbb1e0599 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 15 Jan 2021 03:19:38 +0100 Subject: [PATCH 08/25] Cleanup includes etc --- .../share/classes/java/lang/String.java | 139 ++++++++++-------- .../openjdk/bench/java/lang/StringDecode.java | 2 +- 2 files changed, 79 insertions(+), 62 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index eb962b1dcf7fa..0230f639a7c41 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -33,7 +33,12 @@ import java.lang.constant.ConstantDesc; import java.nio.ByteBuffer; import java.nio.CharBuffer; -import java.nio.charset.*; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.MalformedInputException; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -55,8 +60,6 @@ import jdk.internal.vm.annotation.Stable; import sun.nio.cs.ArrayDecoder; -import static java.lang.Character.*; -import static java.lang.Character.lowSurrogate; import static java.lang.StringCoding.ISO_8859_1; import static java.lang.StringCoding.US_ASCII; import static java.lang.StringCoding.UTF_8; @@ -524,6 +527,11 @@ public String(byte[] bytes, int offset, int length, String charsetName) public String(byte[] bytes, int offset, int length, Charset charset) { Objects.requireNonNull(charset); checkBoundsOffCount(offset, length, bytes.length); + if (length == 0) { + this.value = "".value; + this.coder = "".coder; + return; + } if (charset == UTF_8) { if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { this.value = Arrays.copyOfRange(bytes, offset, offset + length); @@ -607,7 +615,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); - if (isSurrogate(c)) { + if (Character.isSurrogate(c)) { putChar(dst, dp++, REPL); } else { putChar(dst, dp++, c); @@ -635,13 +643,13 @@ public String(byte[] bytes, int offset, int length, Charset charset) { ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); if (StringCoding.isMalformed4(b2, b3, b4) || - !isSupplementaryCodePoint(uc)) { // shortest form check + !Character.isSupplementaryCodePoint(uc)) { // shortest form check putChar(dst, dp++, REPL); offset -= 4; offset += StringCoding.malformedN(bytes, offset, 4); } else { - putChar(dst, dp++, highSurrogate(uc)); - putChar(dst, dp++, lowSurrogate(uc)); + putChar(dst, dp++, Character.highSurrogate(uc)); + putChar(dst, dp++, Character.lowSurrogate(uc)); } continue; } @@ -706,39 +714,35 @@ public String(byte[] bytes, int offset, int length, Charset charset) { // but it then can be argued that the SM is null when the operation // is started... CharsetDecoder cd = charset.newDecoder(); - // ascii fastpath - if ((cd instanceof ArrayDecoder) && - ((ArrayDecoder)cd).isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) { - if (COMPACT_STRINGS) { - this.value = Arrays.copyOfRange(bytes, offset, offset + length); + // ArrayDecoder fastpaths + if (cd instanceof ArrayDecoder ad) { + // ascii + if (ad.isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) { + if (COMPACT_STRINGS) { + this.value = Arrays.copyOfRange(bytes, offset, offset + length); + this.coder = LATIN1; + return; + } + this.value = StringLatin1.inflate(bytes, offset, length); + this.coder = UTF16; + return; + } + + // fastpath for always Latin1 decodable single byte + if (COMPACT_STRINGS && ad.isLatin1Decodable()) { + byte[] dst = new byte[length]; + ad.decodeToLatin1(bytes, offset, length, dst); + this.value = dst; this.coder = LATIN1; return; } - this.value = StringLatin1.inflate(bytes, offset, length); - this.coder = UTF16; - return; - } - // fastpath for always Latin1 decodable single byte - if (COMPACT_STRINGS && cd instanceof ArrayDecoder && ((ArrayDecoder)cd).isLatin1Decodable()) { - byte[] dst = new byte[length]; - ((ArrayDecoder)cd).decodeToLatin1(bytes, offset, length, dst); - this.value = dst; - this.coder = LATIN1; - return; - } - if (length == 0) { - this.value = "".value; - this.coder = "".coder; - return; - } - int en = StringCoding.scale(length, cd.maxCharsPerByte()); - cd.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE) - .reset(); - char[] ca = new char[en]; - if (cd instanceof ArrayDecoder) { - int clen = ((ArrayDecoder)cd).decode(bytes, offset, length, ca); + int en = StringCoding.scale(length, cd.maxCharsPerByte()); + cd.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE) + .reset(); + char[] ca = new char[en]; + int clen = ad.decode(bytes, offset, length, ca); if (COMPACT_STRINGS) { byte[] bs = StringUTF16.compress(ca, 0, clen); if (bs != null) { @@ -751,27 +755,22 @@ public String(byte[] bytes, int offset, int length, Charset charset) { value = StringUTF16.toBytes(ca, 0, clen); return; } + + // decode using CharsetDecoder + int en = StringCoding.scale(length, cd.maxCharsPerByte()); + cd.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE) + .reset(); + char[] ca = new char[en]; if (charset.getClass().getClassLoader0() != null && System.getSecurityManager() != null) { bytes = Arrays.copyOfRange(bytes, offset, offset + length); offset = 0; } - ByteBuffer bb = ByteBuffer.wrap(bytes, offset, length); - CharBuffer cb = CharBuffer.wrap(ca); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } + + int caLen = decode(cd, ca, bytes, offset, length); if (COMPACT_STRINGS) { - byte[] bs = StringUTF16.compress(ca, 0, cb.position()); + byte[] bs = StringUTF16.compress(ca, 0, caLen); if (bs != null) { value = bs; coder = LATIN1; @@ -779,8 +778,26 @@ public String(byte[] bytes, int offset, int length, Charset charset) { } } coder = UTF16; - value = StringUTF16.toBytes(ca, 0, cb.position()); + value = StringUTF16.toBytes(ca, 0, caLen); + } + } + + private static int decode(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length) { + ByteBuffer bb = ByteBuffer.wrap(src, offset, length); + CharBuffer cb = CharBuffer.wrap(dst, 0, dst.length); + try { + CoderResult cr = cd.decode(bb, cb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = cd.flush(cb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + // Substitution is always enabled, + // so this shouldn't happen + throw new Error(x); } + return cb.position(); } ////////////////////// for j.u.z.ZipCoder ////////////////////////// @@ -798,8 +815,8 @@ static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingExce } catch (IllegalArgumentException e) { //newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause Throwable cause = e.getCause(); - if (cause instanceof MalformedInputException) { - throw (MalformedInputException)cause; + if (cause instanceof MalformedInputException mie) { + throw mie; } throw (CharacterCodingException)cause; } @@ -826,8 +843,8 @@ static String newStringNoRepl1(byte[] src, Charset cs) { CharsetDecoder cd = cs.newDecoder(); // ascii fastpath - if ((cd instanceof ArrayDecoder) && - ((ArrayDecoder)cd).isASCIICompatible() && !StringCoding.hasNegatives(src, 0, src.length)) { + if (cd instanceof ArrayDecoder ad && + ad.isASCIICompatible() && !StringCoding.hasNegatives(src, 0, src.length)) { return new String(src, 0, src.length, ISO_8859_1); } int len = src.length; @@ -940,7 +957,7 @@ private String(byte[] bytes, int offset, int length, Void throwOnError) { (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); - if (isSurrogate(c)) { + if (Character.isSurrogate(c)) { StringCoding.throwMalformed(offset - 3, 3); } else { putChar(dst, dp++, c); @@ -968,11 +985,11 @@ private String(byte[] bytes, int offset, int length, Void throwOnError) { ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); if (StringCoding.isMalformed4(b2, b3, b4) || - !isSupplementaryCodePoint(uc)) { // shortest form check + !Character.isSupplementaryCodePoint(uc)) { // shortest form check StringCoding.throwMalformed(offset - 4, 4); } else { - putChar(dst, dp++, highSurrogate(uc)); - putChar(dst, dp++, lowSurrogate(uc)); + putChar(dst, dp++, Character.highSurrogate(uc)); + putChar(dst, dp++, Character.lowSurrogate(uc)); } continue; } @@ -1073,7 +1090,7 @@ public String(byte bytes[], Charset charset) { * * @since 1.1 */ - public String(byte bytes[], int offset, int length) { + public String(byte[] bytes, int offset, int length) { this(bytes, offset, length, Charset.defaultCharset()); } diff --git a/test/micro/org/openjdk/bench/java/lang/StringDecode.java b/test/micro/org/openjdk/bench/java/lang/StringDecode.java index 26235131aec3d..04ffdf2a84e6a 100644 --- a/test/micro/org/openjdk/bench/java/lang/StringDecode.java +++ b/test/micro/org/openjdk/bench/java/lang/StringDecode.java @@ -54,7 +54,7 @@ public class StringDecode { @State(Scope.Thread) public static class WithCharset { - @Param({"US-ASCII", "ISO_8859_1", "UTF-8", "UTF_16"}) + @Param({"US-ASCII", "ISO_8859_1", "UTF-8", "MS932", "ISO_8859_6"}) private String charsetName; private Charset charset; From a5f610ee3897c10c5ad69f303fd011435c9724be Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 15 Jan 2021 14:48:50 +0100 Subject: [PATCH 09/25] Refactor charset lookups to ensure expected exceptions are thrown on null, foo etc --- .../share/classes/java/lang/String.java | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 0230f639a7c41..ee484735c805d 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -33,12 +33,7 @@ import java.lang.constant.ConstantDesc; import java.nio.ByteBuffer; import java.nio.CharBuffer; -import java.nio.charset.CharacterCodingException; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CoderResult; -import java.nio.charset.CodingErrorAction; -import java.nio.charset.MalformedInputException; +import java.nio.charset.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -491,7 +486,21 @@ public String(byte ascii[], int hibyte) { */ public String(byte[] bytes, int offset, int length, String charsetName) throws UnsupportedEncodingException { - this(bytes, offset, length, StringCoding.lookupCharset(Objects.requireNonNull(charsetName))); + this(bytes, offset, length, lookupCharset(charsetName)); + } + + private static Charset lookupCharset(String charsetName) + throws UnsupportedEncodingException { + Objects.requireNonNull(charsetName); + try { + Charset cs = StringCoding.lookupCharset(charsetName); + if (cs == null) { + throw new UnsupportedEncodingException(charsetName); + } + return cs; + } catch (IllegalCharsetNameException ics) { + throw new UnsupportedEncodingException(charsetName); + } } /** From dfce627bf394d656ea277a48d2099e66c1b9fef0 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 15 Jan 2021 15:16:20 +0100 Subject: [PATCH 10/25] Fix charset names in micro settings --- src/java.base/share/classes/java/lang/String.java | 6 ++---- test/micro/org/openjdk/bench/java/lang/StringDecode.java | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index ee484735c805d..658eb2ccc62df 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -748,8 +748,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { int en = StringCoding.scale(length, cd.maxCharsPerByte()); cd.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE) - .reset(); + .onUnmappableCharacter(CodingErrorAction.REPLACE); char[] ca = new char[en]; int clen = ad.decode(bytes, offset, length, ca); if (COMPACT_STRINGS) { @@ -768,8 +767,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { // decode using CharsetDecoder int en = StringCoding.scale(length, cd.maxCharsPerByte()); cd.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE) - .reset(); + .onUnmappableCharacter(CodingErrorAction.REPLACE); char[] ca = new char[en]; if (charset.getClass().getClassLoader0() != null && System.getSecurityManager() != null) { diff --git a/test/micro/org/openjdk/bench/java/lang/StringDecode.java b/test/micro/org/openjdk/bench/java/lang/StringDecode.java index 04ffdf2a84e6a..22cc920ead50a 100644 --- a/test/micro/org/openjdk/bench/java/lang/StringDecode.java +++ b/test/micro/org/openjdk/bench/java/lang/StringDecode.java @@ -54,7 +54,7 @@ public class StringDecode { @State(Scope.Thread) public static class WithCharset { - @Param({"US-ASCII", "ISO_8859_1", "UTF-8", "MS932", "ISO_8859_6"}) + @Param({"US-ASCII", "ISO-8859-1", "UTF-8", "MS932", "ISO-8859-6"}) private String charsetName; private Charset charset; From f14826b11ddfc42c10dbba427edc53aa97e6ed34 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 15 Jan 2021 15:43:27 +0100 Subject: [PATCH 11/25] Remove unused imports --- src/java.base/share/classes/java/lang/StringCoding.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 0661ea155d116..275b70b0aba62 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -30,7 +30,6 @@ import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CharacterCodingException; import java.nio.charset.CoderResult; @@ -42,17 +41,10 @@ import java.util.Arrays; import jdk.internal.vm.annotation.IntrinsicCandidate; import sun.nio.cs.HistoricallyNamedCharset; -import sun.nio.cs.ArrayDecoder; import sun.nio.cs.ArrayEncoder; import static java.lang.String.LATIN1; import static java.lang.String.UTF16; -import static java.lang.String.COMPACT_STRINGS; -import static java.lang.Character.isSurrogate; -import static java.lang.Character.highSurrogate; -import static java.lang.Character.lowSurrogate; -import static java.lang.Character.isSupplementaryCodePoint; -import static java.lang.StringUTF16.putChar; /** * Utility class for string encoding and decoding. From c8899e1539eead1f101a23cb28bff8d292f82529 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 15 Jan 2021 22:56:56 +0100 Subject: [PATCH 12/25] Outline much of the decode logic back into StringCoding --- .../share/classes/java/lang/String.java | 184 +----------------- .../share/classes/java/lang/StringCoding.java | 137 +++++++++++++ 2 files changed, 143 insertions(+), 178 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 658eb2ccc62df..2117d509e63bc 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -58,7 +58,6 @@ import static java.lang.StringCoding.ISO_8859_1; import static java.lang.StringCoding.US_ASCII; import static java.lang.StringCoding.UTF_8; -import static java.lang.StringUTF16.putChar; import static java.util.function.Predicate.not; /** @@ -224,8 +223,6 @@ public final class String COMPACT_STRINGS = true; } - private static final char REPL = '\ufffd'; - /** * Class String is special cased within the Serialization Stream Protocol. * @@ -563,9 +560,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { offset + 1 < sl) { int b2 = bytes[offset + 1]; if (!StringCoding.isNotContinuation(b2)) { - dst[dp++] = (byte)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0))); + dst[dp++] = (byte)StringCoding.decode2(b1, b2); offset += 2; continue; } @@ -590,94 +585,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { StringLatin1.inflate(dst, 0, buf, 0, dp); dst = buf; } - while (offset < sl) { - int b1 = bytes[offset++]; - if (b1 >= 0) { - putChar(dst, dp++, (char) b1); - } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { - if (offset < sl) { - int b2 = bytes[offset++]; - if (StringCoding.isNotContinuation(b2)) { - putChar(dst, dp++, REPL); - offset--; - } else { - putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0)))); - } - continue; - } - putChar(dst, dp++, REPL); - break; - } else if ((b1 >> 4) == -2) { - if (offset + 1 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; - if (StringCoding.isMalformed3(b1, b2, b3)) { - putChar(dst, dp++, REPL); - offset -= 3; - offset += StringCoding.malformedN(bytes, offset, 3); - } else { - char c = (char)((b1 << 12) ^ - (b2 << 6) ^ - (b3 ^ - (((byte) 0xE0 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (Character.isSurrogate(c)) { - putChar(dst, dp++, REPL); - } else { - putChar(dst, dp++, c); - } - } - continue; - } - if (offset < sl && StringCoding.isMalformed3_2(b1, bytes[offset])) { - putChar(dst, dp++, REPL); - continue; - } - putChar(dst, dp++, REPL); - break; - } else if ((b1 >> 3) == -2) { - if (offset + 2 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; - int b4 = bytes[offset++]; - int uc = ((b1 << 18) ^ - (b2 << 12) ^ - (b3 << 6) ^ - (b4 ^ - (((byte) 0xF0 << 18) ^ - ((byte) 0x80 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (StringCoding.isMalformed4(b2, b3, b4) || - !Character.isSupplementaryCodePoint(uc)) { // shortest form check - putChar(dst, dp++, REPL); - offset -= 4; - offset += StringCoding.malformedN(bytes, offset, 4); - } else { - putChar(dst, dp++, Character.highSurrogate(uc)); - putChar(dst, dp++, Character.lowSurrogate(uc)); - } - continue; - } - b1 &= 0xff; - if (b1 > 0xf4 || - offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { - putChar(dst, dp++, REPL); - continue; - } - offset++; - putChar(dst, dp++, REPL); - if (offset < sl && StringCoding.isMalformed4_3(bytes[offset])) { - continue; - } - break; - } else { - putChar(dst, dp++, REPL); - } - } + dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true); if (dp != length) { dst = Arrays.copyOf(dst, dp << 1); } @@ -701,7 +609,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { int dp = 0; while (dp < length) { int b = bytes[offset++]; - putChar(dst, dp++, (b >= 0) ? (char) b : REPL); + StringUTF16.putChar(dst, dp++, (b >= 0) ? (char) b : StringCoding.REPL); } this.value = dst; this.coder = UTF16; @@ -906,14 +814,12 @@ private String(byte[] bytes, int offset, int length, Void throwOnError) { offset + 1 < sl) { int b2 = bytes[offset + 1]; if (!StringCoding.isNotContinuation(b2)) { - dst[dp++] = (byte)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0))); + dst[dp++] = (byte)StringCoding.decode2(b1, b2); offset += 2; continue; } } - // anything not a latin1, including the repl + // anything not a latin1, including the REPL // we have to go with the utf16 break; } @@ -933,85 +839,7 @@ private String(byte[] bytes, int offset, int length, Void throwOnError) { StringLatin1.inflate(dst, 0, buf, 0, dp); dst = buf; } - while (offset < sl) { - int b1 = bytes[offset++]; - if (b1 >= 0) { - putChar(dst, dp++, (char) b1); - } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { - if (offset < sl) { - int b2 = bytes[offset++]; - if (StringCoding.isNotContinuation(b2)) { - StringCoding.throwMalformed(offset - 1, 1); - } else { - putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0)))); - } - continue; - } - StringCoding.throwMalformed(offset, 1); // underflow() - break; - } else if ((b1 >> 4) == -2) { - if (offset + 1 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; - if (StringCoding.isMalformed3(b1, b2, b3)) { - StringCoding.throwMalformed(offset - 3, 3); - } else { - char c = (char)((b1 << 12) ^ - (b2 << 6) ^ - (b3 ^ - (((byte) 0xE0 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (Character.isSurrogate(c)) { - StringCoding.throwMalformed(offset - 3, 3); - } else { - putChar(dst, dp++, c); - } - } - continue; - } - if (offset < sl && StringCoding.isMalformed3_2(b1, bytes[offset])) { - StringCoding.throwMalformed(offset - 1, 2); - continue; - } - StringCoding.throwMalformed(offset, 1); - break; - } else if ((b1 >> 3) == -2) { - if (offset + 2 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; - int b4 = bytes[offset++]; - int uc = ((b1 << 18) ^ - (b2 << 12) ^ - (b3 << 6) ^ - (b4 ^ - (((byte) 0xF0 << 18) ^ - ((byte) 0x80 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - if (StringCoding.isMalformed4(b2, b3, b4) || - !Character.isSupplementaryCodePoint(uc)) { // shortest form check - StringCoding.throwMalformed(offset - 4, 4); - } else { - putChar(dst, dp++, Character.highSurrogate(uc)); - putChar(dst, dp++, Character.lowSurrogate(uc)); - } - continue; - } - b1 &= 0xff; - if (b1 > 0xf4 || - offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { - StringCoding.throwMalformed(offset - 1, 1); // or 2 - continue; - } - StringCoding.throwMalformed(offset - 1, 1); - break; - } else { - StringCoding.throwMalformed(offset - 1, 1); - } - } + dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); if (dp != length) { dst = Arrays.copyOf(dst, dp << 1); } diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 275b70b0aba62..6385d536cb2ff 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -45,6 +45,7 @@ import static java.lang.String.LATIN1; import static java.lang.String.UTF16; +import static java.lang.StringUTF16.putChar; /** * Utility class for string encoding and decoding. @@ -62,6 +63,8 @@ private StringCoding() { } static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; + static final char REPL = '\ufffd'; + private static T deref(ThreadLocal> tl) { SoftReference sr = tl.get(); if (sr == null) @@ -399,6 +402,140 @@ static boolean isMalformed4_3(int b3) { return (b3 & 0xc0) != 0x80; } + static char decode2(int b1, int b2) { + return (char)(((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); + } + + static char decode3(int b1, int b2, int b3) { + return (char)((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + } + + static int decode4(int b1, int b2, int b3, int b4) { + return ((b1 << 18) ^ + (b2 << 12) ^ + (b3 << 6) ^ + (b4 ^ + (((byte) 0xF0 << 18) ^ + ((byte) 0x80 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + } + + static int decodeUTF8_UTF16(byte[] bytes, int offset, int sl, byte[] dst, int dp, boolean doReplace) { + while (offset < sl) { + int b1 = bytes[offset++]; + if (b1 >= 0) { + putChar(dst, dp++, (char) b1); + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + if (offset < sl) { + int b2 = bytes[offset++]; + if (StringCoding.isNotContinuation(b2)) { + if (!doReplace) { + throwMalformed(offset - 1, 1); + } + putChar(dst, dp++, REPL); + offset--; + } else { + putChar(dst, dp++, decode2(b1, b2)); + } + continue; + } + if (!doReplace) { + throwMalformed(offset, 1); // underflow() + } + putChar(dst, dp++, REPL); + break; + } else if ((b1 >> 4) == -2) { + if (offset + 1 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + if (isMalformed3(b1, b2, b3)) { + if (!doReplace) { + throwMalformed(offset - 3, 3); + } + putChar(dst, dp++, REPL); + offset -= 3; + offset += malformedN(bytes, offset, 3); + } else { + char c = decode3(b1, b2, b3); + if (Character.isSurrogate(c)) { + if (!doReplace) { + throwMalformed(offset - 3, 3); + } + putChar(dst, dp++, REPL); + } else { + putChar(dst, dp++, c); + } + } + continue; + } + if (offset < sl && isMalformed3_2(b1, bytes[offset])) { + if (!doReplace) { + throwMalformed(offset - 1, 2); + } + putChar(dst, dp++, REPL); + continue; + } + if (!doReplace) { + throwMalformed(offset, 1); + } + putChar(dst, dp++, REPL); + break; + } else if ((b1 >> 3) == -2) { + if (offset + 2 < sl) { + int b2 = bytes[offset++]; + int b3 = bytes[offset++]; + int b4 = bytes[offset++]; + int uc = decode4(b1, b2, b3, b4); + if (isMalformed4(b2, b3, b4) || + !Character.isSupplementaryCodePoint(uc)) { // shortest form check + if (!doReplace) { + throwMalformed(offset - 4, 4); + } + putChar(dst, dp++, REPL); + offset -= 4; + offset += StringCoding.malformedN(bytes, offset, 4); + } else { + putChar(dst, dp++, Character.highSurrogate(uc)); + putChar(dst, dp++, Character.lowSurrogate(uc)); + } + continue; + } + b1 &= 0xff; + if (b1 > 0xf4 || + offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { + if (!doReplace) { + throwMalformed(offset - 1, 1); // or 2 + } + putChar(dst, dp++, REPL); + continue; + } + if (!doReplace) { + throwMalformed(offset - 1, 1); + } + offset++; + putChar(dst, dp++, REPL); + if (offset < sl && StringCoding.isMalformed4_3(bytes[offset])) { + continue; + } + break; + } else { + if (!doReplace) { + throwMalformed(offset - 1, 1); + } + putChar(dst, dp++, REPL); + } + } + return dp; + } + // for nb == 3/4 static int malformedN(byte[] src, int sp, int nb) { if (nb == 3) { From f328f4516541170c23ffc8d6966b7572c38c939d Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Sat, 16 Jan 2021 00:37:26 +0100 Subject: [PATCH 13/25] Copyrights --- src/java.base/share/classes/java/lang/String.java | 2 +- src/java.base/share/classes/java/lang/System.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 2117d509e63bc..7e4077d16fb59 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 1994, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1994, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/java.base/share/classes/java/lang/System.java b/src/java.base/share/classes/java/lang/System.java index 91fbadc4d1bfa..e536818fbf6e9 100644 --- a/src/java.base/share/classes/java/lang/System.java +++ b/src/java.base/share/classes/java/lang/System.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 1994, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1994, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it From eb9c8507ab00ef57aed7e509ea81c5c3657401f7 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Sat, 16 Jan 2021 02:10:31 +0100 Subject: [PATCH 14/25] Further simplifications --- .../share/classes/java/lang/String.java | 140 ++++++------------ .../share/classes/java/lang/StringCoding.java | 18 +++ 2 files changed, 67 insertions(+), 91 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 7e4077d16fb59..3a6415fb938f4 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -683,7 +683,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { offset = 0; } - int caLen = decode(cd, ca, bytes, offset, length); + int caLen = StringCoding.decodeWithDecoder(cd, ca, bytes, offset, length); if (COMPACT_STRINGS) { byte[] bs = StringUTF16.compress(ca, 0, caLen); if (bs != null) { @@ -697,31 +697,54 @@ public String(byte[] bytes, int offset, int length, Charset charset) { } } - private static int decode(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length) { - ByteBuffer bb = ByteBuffer.wrap(src, offset, length); - CharBuffer cb = CharBuffer.wrap(dst, 0, dst.length); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } - return cb.position(); - } - - ////////////////////// for j.u.z.ZipCoder ////////////////////////// - /* * Throws iae, instead of replacing, if malformed or unmappable. */ - static String newStringUTF8NoRepl(byte[] src, int off, int len) { - return new String(src, off, len, (Void)null); + static String newStringUTF8NoRepl(byte[] bytes, int offset, int length) { + checkBoundsOffCount(offset, length, bytes.length); + int sl = offset + length; + int dp = 0; + byte[] dst = new byte[length]; + if (COMPACT_STRINGS) { + while (offset < sl) { + int b1 = bytes[offset]; + if (b1 >= 0) { + dst[dp++] = (byte)b1; + offset++; + continue; + } + if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && + offset + 1 < sl) { + int b2 = bytes[offset + 1]; + if (!StringCoding.isNotContinuation(b2)) { + dst[dp++] = (byte)StringCoding.decode2(b1, b2); + offset += 2; + continue; + } + } + // anything not a latin1, including the REPL + // we have to go with the utf16 + break; + } + if (offset == sl) { + if (dp != dst.length) { + dst = Arrays.copyOf(dst, dp); + } + return new String(dst, LATIN1); + } + } + if (dp == 0) { + dst = new byte[length << 1]; + } else { + byte[] buf = new byte[length << 1]; + StringLatin1.inflate(dst, 0, buf, 0, dp); + dst = buf; + } + dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); + if (dp != length) { + dst = Arrays.copyOf(dst, dp << 1); + } + return new String(dst, UTF16); } static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { @@ -772,79 +795,14 @@ static String newStringNoRepl1(byte[] src, Charset cs) { System.getSecurityManager() != null) { src = Arrays.copyOf(src, len); } - ByteBuffer bb = ByteBuffer.wrap(src); - CharBuffer cb = CharBuffer.wrap(ca); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - throw new IllegalArgumentException(x); - } + int caLen = StringCoding.decodeWithDecoder(cd, ca, src, 0, src.length); if (COMPACT_STRINGS) { - byte[] bs = StringUTF16.compress(ca, 0, cb.position()); + byte[] bs = StringUTF16.compress(ca, 0, caLen); if (bs != null) { return new String(bs, LATIN1); } } - return new String(StringUTF16.toBytes(ca, 0, cb.position()), UTF16); - } - - /* - * Private constructor for doing UTF-8 decode, but throwing iae on malformed or - * unmappable characters - */ - private String(byte[] bytes, int offset, int length, Void throwOnError) { - checkBoundsOffCount(offset, length, bytes.length); - int sl = offset + length; - int dp = 0; - byte[] dst = new byte[length]; - if (COMPACT_STRINGS) { - while (offset < sl) { - int b1 = bytes[offset]; - if (b1 >= 0) { - dst[dp++] = (byte)b1; - offset++; - continue; - } - if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && - offset + 1 < sl) { - int b2 = bytes[offset + 1]; - if (!StringCoding.isNotContinuation(b2)) { - dst[dp++] = (byte)StringCoding.decode2(b1, b2); - offset += 2; - continue; - } - } - // anything not a latin1, including the REPL - // we have to go with the utf16 - break; - } - if (offset == sl) { - if (dp != dst.length) { - dst = Arrays.copyOf(dst, dp); - } - this.value = dst; - this.coder = LATIN1; - return; - } - } - if (dp == 0) { - dst = new byte[length << 1]; - } else { - byte[] buf = new byte[length << 1]; - StringLatin1.inflate(dst, 0, buf, 0, dp); - dst = buf; - } - dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); - if (dp != length) { - dst = Arrays.copyOf(dst, dp << 1); - } - this.value = dst; - this.coder = UTF16; + return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16); } /** diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 6385d536cb2ff..f869852d536fb 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -536,6 +536,24 @@ static int decodeUTF8_UTF16(byte[] bytes, int offset, int sl, byte[] dst, int dp return dp; } + static int decodeWithDecoder(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length) { + ByteBuffer bb = ByteBuffer.wrap(src, offset, length); + CharBuffer cb = CharBuffer.wrap(dst, 0, dst.length); + try { + CoderResult cr = cd.decode(bb, cb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = cd.flush(cb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + // Substitution is always enabled, + // so this shouldn't happen + throw new Error(x); + } + return cb.position(); + } + // for nb == 3/4 static int malformedN(byte[] src, int sp, int nb) { if (nb == 3) { From 4c8eacd1d78cec422ca236c0f1f6aafbe3bc11c5 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Sat, 16 Jan 2021 02:14:23 +0100 Subject: [PATCH 15/25] Add missing import (who needs IDEs?) --- src/java.base/share/classes/java/lang/StringCoding.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index f869852d536fb..9c0029939fe64 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -30,6 +30,7 @@ import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CharacterCodingException; import java.nio.charset.CoderResult; From 790e746342826ae18bcc169cbb9c7d024a592937 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Sun, 17 Jan 2021 13:03:47 +0100 Subject: [PATCH 16/25] Simplify lookupCharset --- .../share/classes/java/lang/String.java | 16 +------ .../share/classes/java/lang/StringCoding.java | 47 ++++++++----------- 2 files changed, 20 insertions(+), 43 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 3a6415fb938f4..c133b995b87ac 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -483,21 +483,7 @@ public String(byte ascii[], int hibyte) { */ public String(byte[] bytes, int offset, int length, String charsetName) throws UnsupportedEncodingException { - this(bytes, offset, length, lookupCharset(charsetName)); - } - - private static Charset lookupCharset(String charsetName) - throws UnsupportedEncodingException { - Objects.requireNonNull(charsetName); - try { - Charset cs = StringCoding.lookupCharset(charsetName); - if (cs == null) { - throw new UnsupportedEncodingException(charsetName); - } - return cs; - } catch (IllegalCharsetNameException ics) { - throw new UnsupportedEncodingException(charsetName); - } + this(bytes, offset, length, StringCoding.lookupCharset(charsetName)); } /** diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 9c0029939fe64..5602fd8e1c155 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -40,6 +40,8 @@ import java.nio.charset.UnmappableCharacterException; import java.nio.charset.UnsupportedCharsetException; import java.util.Arrays; +import java.util.Objects; + import jdk.internal.vm.annotation.IntrinsicCandidate; import sun.nio.cs.HistoricallyNamedCharset; import sun.nio.cs.ArrayEncoder; @@ -91,15 +93,13 @@ static int scale(int len, float expansionFactor) { return (int)(len * (double)expansionFactor); } - static Charset lookupCharset(String csn) { - if (Charset.isSupported(csn)) { - try { - return Charset.forName(csn); - } catch (UnsupportedCharsetException x) { - throw new Error(x); - } + static Charset lookupCharset(String csn) throws UnsupportedEncodingException { + Objects.requireNonNull(csn); + try { + return Charset.forName(csn); + } catch (UnsupportedCharsetException | IllegalCharsetNameException x) { + throw new UnsupportedEncodingException(csn); } - return null; } @IntrinsicCandidate @@ -181,32 +181,23 @@ byte[] encode(byte coder, byte[] val) { } } - static byte[] encode(String charsetName, byte coder, byte[] val) + static byte[] encode(String csn, byte coder, byte[] val) throws UnsupportedEncodingException { StringEncoder se = deref(encoder); - String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; if ((se == null) || !(csn.equals(se.requestedCharsetName()) || csn.equals(se.charsetName()))) { - se = null; - try { - Charset cs = lookupCharset(csn); - if (cs != null) { - if (cs == UTF_8) { - return encodeUTF8(coder, val, true); - } - if (cs == ISO_8859_1) { - return encode8859_1(coder, val); - } - if (cs == US_ASCII) { - return encodeASCII(coder, val); - } - se = new StringEncoder(cs, csn); - } - } catch (IllegalCharsetNameException x) {} - if (se == null) { - throw new UnsupportedEncodingException (csn); + Charset cs = lookupCharset(csn); + if (cs == UTF_8) { + return encodeUTF8(coder, val, true); + } + if (cs == ISO_8859_1) { + return encode8859_1(coder, val); + } + if (cs == US_ASCII) { + return encodeASCII(coder, val); } + se = new StringEncoder(cs, csn); set(encoder, se); } return se.encode(coder, val); From e870b3bb8b4a6ee45878078a0c2fb8bda8c9a140 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Sun, 17 Jan 2021 23:38:46 +0100 Subject: [PATCH 17/25] ASCII fast-path missing for UTF-8 NoRepl methods --- .../share/classes/java/lang/String.java | 85 ++++++++++--------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index c133b995b87ac..b96884f82cb99 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -532,9 +532,9 @@ public String(byte[] bytes, int offset, int length, Charset charset) { } else { int sl = offset + length; int dp = 0; - byte[] dst = new byte[length]; - + byte[] dst = null; if (COMPACT_STRINGS) { + dst = new byte[length]; while (offset < sl) { int b1 = bytes[offset]; if (b1 >= 0) { @@ -564,7 +564,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { return; } } - if (dp == 0) { + if (dp == 0 || dst == null) { dst = new byte[length << 1]; } else { byte[] buf = new byte[length << 1]; @@ -688,49 +688,54 @@ public String(byte[] bytes, int offset, int length, Charset charset) { */ static String newStringUTF8NoRepl(byte[] bytes, int offset, int length) { checkBoundsOffCount(offset, length, bytes.length); - int sl = offset + length; - int dp = 0; - byte[] dst = new byte[length]; - if (COMPACT_STRINGS) { - while (offset < sl) { - int b1 = bytes[offset]; - if (b1 >= 0) { - dst[dp++] = (byte)b1; - offset++; - continue; - } - if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && - offset + 1 < sl) { - int b2 = bytes[offset + 1]; - if (!StringCoding.isNotContinuation(b2)) { - dst[dp++] = (byte)StringCoding.decode2(b1, b2); - offset += 2; + if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { + return new String(Arrays.copyOfRange(bytes, offset, offset + length), LATIN1); + } else { + int sl = offset + length; + int dp = 0; + byte[] dst = null; + if (COMPACT_STRINGS) { + dst = new byte[length]; + while (offset < sl) { + int b1 = bytes[offset]; + if (b1 >= 0) { + dst[dp++] = (byte) b1; + offset++; continue; } + if ((b1 == (byte) 0xc2 || b1 == (byte) 0xc3) && + offset + 1 < sl) { + int b2 = bytes[offset + 1]; + if (!StringCoding.isNotContinuation(b2)) { + dst[dp++] = (byte) StringCoding.decode2(b1, b2); + offset += 2; + continue; + } + } + // anything not a latin1, including the REPL + // we have to go with the utf16 + break; } - // anything not a latin1, including the REPL - // we have to go with the utf16 - break; - } - if (offset == sl) { - if (dp != dst.length) { - dst = Arrays.copyOf(dst, dp); + if (offset == sl) { + if (dp != dst.length) { + dst = Arrays.copyOf(dst, dp); + } + return new String(dst, LATIN1); } - return new String(dst, LATIN1); } + if (dp == 0 || dst == null) { + dst = new byte[length << 1]; + } else { + byte[] buf = new byte[length << 1]; + StringLatin1.inflate(dst, 0, buf, 0, dp); + dst = buf; + } + dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); + if (dp != length) { + dst = Arrays.copyOf(dst, dp << 1); + } + return new String(dst, UTF16); } - if (dp == 0) { - dst = new byte[length << 1]; - } else { - byte[] buf = new byte[length << 1]; - StringLatin1.inflate(dst, 0, buf, 0, dp); - dst = buf; - } - dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); - if (dp != length) { - dst = Arrays.copyOf(dst, dp << 1); - } - return new String(dst, UTF16); } static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { From ba279a7e9ea840c3e9539bbfe3aaccd93f0e2748 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Mon, 18 Jan 2021 10:22:43 +0100 Subject: [PATCH 18/25] Harmonize empty string checking in newString methods --- src/java.base/share/classes/java/lang/String.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index b96884f82cb99..b7289f4222859 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -688,6 +688,9 @@ public String(byte[] bytes, int offset, int length, Charset charset) { */ static String newStringUTF8NoRepl(byte[] bytes, int offset, int length) { checkBoundsOffCount(offset, length, bytes.length); + if (length == 0) { + return ""; + } if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { return new String(Arrays.copyOfRange(bytes, offset, offset + length), LATIN1); } else { @@ -752,6 +755,10 @@ static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingExce } static String newStringNoRepl1(byte[] src, Charset cs) { + int len = src.length; + if (len == 0) { + return ""; + } if (cs == UTF_8) { return newStringUTF8NoRepl(src, 0, src.length); } @@ -776,10 +783,6 @@ static String newStringNoRepl1(byte[] src, Charset cs) { ad.isASCIICompatible() && !StringCoding.hasNegatives(src, 0, src.length)) { return new String(src, 0, src.length, ISO_8859_1); } - int len = src.length; - if (len == 0) { - return ""; - } int en = StringCoding.scale(len, cd.maxCharsPerByte()); char[] ca = new char[en]; if (cs.getClass().getClassLoader0() != null && From c37aa600d54edf11a3f571b72693de8acaf3fecb Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Tue, 19 Jan 2021 22:06:03 +0100 Subject: [PATCH 19/25] More privates --- .../share/classes/java/lang/StringCoding.java | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 5602fd8e1c155..32d72b9823305 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -365,51 +365,52 @@ private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { //////////////////////////////// utf8 //////////////////////////////////// + static boolean isNotContinuation(int b) { return (b & 0xc0) != 0x80; } - static boolean isMalformed3(int b1, int b2, int b3) { + private static boolean isMalformed3(int b1, int b2, int b3) { return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; } - static boolean isMalformed3_2(int b1, int b2) { + private static boolean isMalformed3_2(int b1, int b2) { return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80; } - static boolean isMalformed4(int b2, int b3, int b4) { + private static boolean isMalformed4(int b2, int b3, int b4) { return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80; } - static boolean isMalformed4_2(int b1, int b2) { + private static boolean isMalformed4_2(int b1, int b2) { return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || (b2 & 0xc0) != 0x80; } - static boolean isMalformed4_3(int b3) { + private static boolean isMalformed4_3(int b3) { return (b3 & 0xc0) != 0x80; } static char decode2(int b1, int b2) { - return (char)(((b1 << 6) ^ b2)^ + return (char)(((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0))); + ((byte) 0x80 << 0))); } - static char decode3(int b1, int b2, int b3) { + private static char decode3(int b1, int b2, int b3) { return (char)((b1 << 12) ^ - (b2 << 6) ^ - (b3 ^ - (((byte) 0xE0 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); } - static int decode4(int b1, int b2, int b3, int b4) { + private static int decode4(int b1, int b2, int b3, int b4) { return ((b1 << 18) ^ (b2 << 12) ^ (b3 << 6) ^ From 869bc109be2fa6b04b20dd7a3401f0ce6ab000f9 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 21 Jan 2021 12:36:07 +0100 Subject: [PATCH 20/25] More cleanups, make all things private that can be --- .../share/classes/java/lang/String.java | 9 +- .../share/classes/java/lang/StringCoding.java | 100 +++++++++--------- 2 files changed, 50 insertions(+), 59 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index b7289f4222859..7608452b8d1d7 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -609,13 +609,6 @@ public String(byte[] bytes, int offset, int length, Charset charset) { // (2)The defensive copy of the input byte/char[] has a big performance // impact, as well as the outgoing result byte/char[]. Need to do the // optimization check of (sm==null && classLoader0==null) for both. - // (3)There might be a timing gap in isTrusted setting. getClassLoader0() - // is only checked (and then isTrusted gets set) when (SM==null). It is - // possible that the SM==null for now but then SM is NOT null later - // when safeTrim() is invoked...the "safe" way to do is to redundant - // check (... && (isTrusted || SM == null || getClassLoader0())) in trim - // but it then can be argued that the SM is null when the operation - // is started... CharsetDecoder cd = charset.newDecoder(); // ArrayDecoder fastpaths if (cd instanceof ArrayDecoder ad) { @@ -754,7 +747,7 @@ static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingExce } } - static String newStringNoRepl1(byte[] src, Charset cs) { + private static String newStringNoRepl1(byte[] src, Charset cs) { int len = src.length; if (len == 0) { return ""; diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 32d72b9823305..b455e167bf7fc 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -131,17 +131,17 @@ private StringEncoder(Charset cs, String rcn) { ((ArrayEncoder)ce).isASCIICompatible(); } - String charsetName() { + private String charsetName() { if (cs instanceof HistoricallyNamedCharset) return ((HistoricallyNamedCharset)cs).historicalName(); return cs.name(); } - final String requestedCharsetName() { + private final String requestedCharsetName() { return requestedCharsetName; } - byte[] encode(byte coder, byte[] val) { + private byte[] encode(byte coder, byte[] val) { // fastpath for ascii compatible if (coder == LATIN1 && isASCIICompatible && !hasNegatives(val, 0, val.length)) { @@ -346,7 +346,7 @@ private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { dp = dp + ret; if (ret != len) { if (!doReplace) { - throwUnmappable(sp, 1); + throwUnmappable(sp); } char c = StringUTF16.getChar(val, sp++); if (Character.isHighSurrogate(c) && sp < sl && @@ -386,7 +386,7 @@ private static boolean isMalformed4(int b2, int b3, int b4) { } private static boolean isMalformed4_2(int b1, int b2) { - return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || + return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || (b2 & 0xc0) != 0x80; } @@ -397,8 +397,8 @@ private static boolean isMalformed4_3(int b3) { static char decode2(int b1, int b2) { return (char)(((b1 << 6) ^ b2) ^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0))); + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); } private static char decode3(int b1, int b2, int b3) { @@ -421,46 +421,46 @@ private static int decode4(int b1, int b2, int b3, int b4) { ((byte) 0x80 << 0)))); } - static int decodeUTF8_UTF16(byte[] bytes, int offset, int sl, byte[] dst, int dp, boolean doReplace) { - while (offset < sl) { - int b1 = bytes[offset++]; + static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int dp, boolean doReplace) { + while (sp < sl) { + int b1 = src[sp++]; if (b1 >= 0) { putChar(dst, dp++, (char) b1); } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { - if (offset < sl) { - int b2 = bytes[offset++]; + if (sp < sl) { + int b2 = src[sp++]; if (StringCoding.isNotContinuation(b2)) { if (!doReplace) { - throwMalformed(offset - 1, 1); + throwMalformed(sp - 1, 1); } putChar(dst, dp++, REPL); - offset--; + sp--; } else { putChar(dst, dp++, decode2(b1, b2)); } continue; } if (!doReplace) { - throwMalformed(offset, 1); // underflow() + throwMalformed(sp, 1); // underflow() } putChar(dst, dp++, REPL); break; } else if ((b1 >> 4) == -2) { - if (offset + 1 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; + if (sp + 1 < sl) { + int b2 = src[sp++]; + int b3 = src[sp++]; if (isMalformed3(b1, b2, b3)) { if (!doReplace) { - throwMalformed(offset - 3, 3); + throwMalformed(sp - 3, 3); } putChar(dst, dp++, REPL); - offset -= 3; - offset += malformedN(bytes, offset, 3); + sp -= 3; + sp += malformedN(src, sp, 3); } else { char c = decode3(b1, b2, b3); if (Character.isSurrogate(c)) { if (!doReplace) { - throwMalformed(offset - 3, 3); + throwMalformed(sp - 3, 3); } putChar(dst, dp++, REPL); } else { @@ -469,32 +469,32 @@ static int decodeUTF8_UTF16(byte[] bytes, int offset, int sl, byte[] dst, int dp } continue; } - if (offset < sl && isMalformed3_2(b1, bytes[offset])) { + if (sp < sl && isMalformed3_2(b1, src[sp])) { if (!doReplace) { - throwMalformed(offset - 1, 2); + throwMalformed(sp - 1, 2); } putChar(dst, dp++, REPL); continue; } if (!doReplace) { - throwMalformed(offset, 1); + throwMalformed(sp, 1); } putChar(dst, dp++, REPL); break; } else if ((b1 >> 3) == -2) { - if (offset + 2 < sl) { - int b2 = bytes[offset++]; - int b3 = bytes[offset++]; - int b4 = bytes[offset++]; + if (sp + 2 < sl) { + int b2 = src[sp++]; + int b3 = src[sp++]; + int b4 = src[sp++]; int uc = decode4(b1, b2, b3, b4); if (isMalformed4(b2, b3, b4) || !Character.isSupplementaryCodePoint(uc)) { // shortest form check if (!doReplace) { - throwMalformed(offset - 4, 4); + throwMalformed(sp - 4, 4); } putChar(dst, dp++, REPL); - offset -= 4; - offset += StringCoding.malformedN(bytes, offset, 4); + sp -= 4; + sp += StringCoding.malformedN(src, sp, 4); } else { putChar(dst, dp++, Character.highSurrogate(uc)); putChar(dst, dp++, Character.lowSurrogate(uc)); @@ -502,26 +502,25 @@ static int decodeUTF8_UTF16(byte[] bytes, int offset, int sl, byte[] dst, int dp continue; } b1 &= 0xff; - if (b1 > 0xf4 || - offset < sl && StringCoding.isMalformed4_2(b1, bytes[offset] & 0xff)) { + if (b1 > 0xf4 || sp < sl && StringCoding.isMalformed4_2(b1, src[sp] & 0xff)) { if (!doReplace) { - throwMalformed(offset - 1, 1); // or 2 + throwMalformed(sp - 1, 1); // or 2 } putChar(dst, dp++, REPL); continue; } if (!doReplace) { - throwMalformed(offset - 1, 1); + throwMalformed(sp - 1, 1); } - offset++; + sp++; putChar(dst, dp++, REPL); - if (offset < sl && StringCoding.isMalformed4_3(bytes[offset])) { + if (sp < sl && StringCoding.isMalformed4_3(src[sp])) { continue; } break; } else { if (!doReplace) { - throwMalformed(offset - 1, 1); + throwMalformed(sp - 1, 1); } putChar(dst, dp++, REPL); } @@ -551,7 +550,7 @@ static int decodeWithDecoder(CharsetDecoder cd, char[] dst, byte[] src, int offs static int malformedN(byte[] src, int sp, int nb) { if (nb == 3) { int b1 = src[sp++]; - int b2 = src[sp++]; // no need to lookup b3 + int b2 = src[sp]; // no need to lookup b3 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || isNotContinuation(b2)) ? 1 : 2; } else if (nb == 4) { // we don't care the speed here @@ -562,7 +561,7 @@ static int malformedN(byte[] src, int sp, int nb) { (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || isNotContinuation(b2)) return 1; - if (isNotContinuation(src[sp++])) + if (isNotContinuation(src[sp])) return 2; return 3; } @@ -570,7 +569,7 @@ static int malformedN(byte[] src, int sp, int nb) { return -1; } - static void throwMalformed(int off, int nb) { + private static void throwMalformed(int off, int nb) { String msg = "malformed input off : " + off + ", length : " + nb; throw new IllegalArgumentException(msg, new MalformedInputException(nb)); } @@ -581,15 +580,15 @@ static void throwMalformed(byte[] val) { throwMalformed(dp, 1); } - static void throwUnmappable(int off, int nb) { - String msg = "malformed input off : " + off + ", length : " + nb; - throw new IllegalArgumentException(msg, new UnmappableCharacterException(nb)); + private static void throwUnmappable(int off) { + String msg = "malformed input off : " + off + ", length : 1"; + throw new IllegalArgumentException(msg, new UnmappableCharacterException(1)); } static void throwUnmappable(byte[] val) { int dp = 0; while (dp < val.length && val[dp] >=0) { dp++; } - throwUnmappable(dp, 1); + throwUnmappable(dp); } private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { @@ -601,11 +600,10 @@ private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { int dp = 0; byte[] dst = new byte[val.length << 1]; - for (int sp = 0; sp < val.length; sp++) { - byte c = val[sp]; + for (byte c : val) { if (c < 0) { - dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); - dst[dp++] = (byte)(0x80 | (c & 0x3f)); + dst[dp++] = (byte) (0xc0 | ((c & 0xff) >> 6)); + dst[dp++] = (byte) (0x80 | (c & 0x3f)); } else { dst[dp++] = c; } @@ -644,7 +642,7 @@ private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { if (doReplace) { dst[dp++] = '?'; } else { - throwUnmappable(sp - 1, 1); // or 2, does not matter here + throwUnmappable(sp - 1); } } else { dst[dp++] = (byte)(0xf0 | ((uc >> 18))); From a45b761df1e5f3e2caf078e21f020c10c453b443 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 21 Jan 2021 20:48:23 +0100 Subject: [PATCH 21/25] Move most of the encode/decode code to String, remove StringEncoder and the ThreadLocal encoder facility. --- .../share/classes/java/lang/String.java | 593 ++++++++++++++- .../share/classes/java/lang/StringCoding.java | 698 +----------------- .../share/classes/java/lang/System.java | 4 +- .../openjdk/bench/java/lang/StringEncode.java | 88 +++ 4 files changed, 667 insertions(+), 716 deletions(-) create mode 100644 test/micro/org/openjdk/bench/java/lang/StringEncode.java diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 7608452b8d1d7..1ed1bedb6c080 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -54,11 +54,11 @@ import jdk.internal.vm.annotation.IntrinsicCandidate; import jdk.internal.vm.annotation.Stable; import sun.nio.cs.ArrayDecoder; +import sun.nio.cs.ArrayEncoder; import static java.lang.StringCoding.ISO_8859_1; import static java.lang.StringCoding.US_ASCII; import static java.lang.StringCoding.UTF_8; -import static java.util.function.Predicate.not; /** * The {@code String} class represents character strings. All @@ -483,7 +483,7 @@ public String(byte ascii[], int hibyte) { */ public String(byte[] bytes, int offset, int length, String charsetName) throws UnsupportedEncodingException { - this(bytes, offset, length, StringCoding.lookupCharset(charsetName)); + this(bytes, offset, length, lookupCharset(charsetName)); } /** @@ -522,13 +522,10 @@ public String(byte[] bytes, int offset, int length, Charset charset) { if (length == 0) { this.value = "".value; this.coder = "".coder; - return; - } - if (charset == UTF_8) { + } else if (charset == UTF_8) { if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { this.value = Arrays.copyOfRange(bytes, offset, offset + length); this.coder = LATIN1; - return; } else { int sl = offset + length; int dp = 0; @@ -545,8 +542,8 @@ public String(byte[] bytes, int offset, int length, Charset charset) { if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && offset + 1 < sl) { int b2 = bytes[offset + 1]; - if (!StringCoding.isNotContinuation(b2)) { - dst[dp++] = (byte)StringCoding.decode2(b1, b2); + if (!isNotContinuation(b2)) { + dst[dp++] = (byte)decode2(b1, b2); offset += 2; continue; } @@ -571,7 +568,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { StringLatin1.inflate(dst, 0, buf, 0, dp); dst = buf; } - dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true); + dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true); if (dp != length) { dst = Arrays.copyOf(dst, dp << 1); } @@ -595,7 +592,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { int dp = 0; while (dp < length) { int b = bytes[offset++]; - StringUTF16.putChar(dst, dp++, (b >= 0) ? (char) b : StringCoding.REPL); + StringUTF16.putChar(dst, dp++, (b >= 0) ? (char) b : REPL); } this.value = dst; this.coder = UTF16; @@ -633,7 +630,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { return; } - int en = StringCoding.scale(length, cd.maxCharsPerByte()); + int en = scale(length, cd.maxCharsPerByte()); cd.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); char[] ca = new char[en]; @@ -652,7 +649,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { } // decode using CharsetDecoder - int en = StringCoding.scale(length, cd.maxCharsPerByte()); + int en = scale(length, cd.maxCharsPerByte()); cd.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); char[] ca = new char[en]; @@ -662,7 +659,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { offset = 0; } - int caLen = StringCoding.decodeWithDecoder(cd, ca, bytes, offset, length); + int caLen = decodeWithDecoder(cd, ca, bytes, offset, length); if (COMPACT_STRINGS) { byte[] bs = StringUTF16.compress(ca, 0, caLen); if (bs != null) { @@ -702,8 +699,8 @@ static String newStringUTF8NoRepl(byte[] bytes, int offset, int length) { if ((b1 == (byte) 0xc2 || b1 == (byte) 0xc3) && offset + 1 < sl) { int b2 = bytes[offset + 1]; - if (!StringCoding.isNotContinuation(b2)) { - dst[dp++] = (byte) StringCoding.decode2(b1, b2); + if (!isNotContinuation(b2)) { + dst[dp++] = (byte) decode2(b1, b2); offset += 2; continue; } @@ -726,7 +723,7 @@ static String newStringUTF8NoRepl(byte[] bytes, int offset, int length) { StringLatin1.inflate(dst, 0, buf, 0, dp); dst = buf; } - dp = StringCoding.decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); + dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); if (dp != length) { dst = Arrays.copyOf(dst, dp << 1); } @@ -766,23 +763,24 @@ private static String newStringNoRepl1(byte[] src, Charset cs) { return new String(src, LATIN1); return new String(StringLatin1.inflate(src, 0, src.length), UTF16); } else { - StringCoding.throwMalformed(src); + throwMalformed(src); } } CharsetDecoder cd = cs.newDecoder(); // ascii fastpath if (cd instanceof ArrayDecoder ad && - ad.isASCIICompatible() && !StringCoding.hasNegatives(src, 0, src.length)) { + ad.isASCIICompatible() && + !StringCoding.hasNegatives(src, 0, src.length)) { return new String(src, 0, src.length, ISO_8859_1); } - int en = StringCoding.scale(len, cd.maxCharsPerByte()); + int en = scale(len, cd.maxCharsPerByte()); char[] ca = new char[en]; if (cs.getClass().getClassLoader0() != null && System.getSecurityManager() != null) { src = Arrays.copyOf(src, len); } - int caLen = StringCoding.decodeWithDecoder(cd, ca, src, 0, src.length); + int caLen = decodeWithDecoder(cd, ca, src, 0, src.length); if (COMPACT_STRINGS) { byte[] bs = StringUTF16.compress(ca, 0, caLen); if (bs != null) { @@ -792,6 +790,555 @@ private static String newStringNoRepl1(byte[] src, Charset cs) { return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16); } + private static final char REPL = '\ufffd'; + + // Trim the given byte array to the given length + private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { + if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) { + return ba; + } else { + return Arrays.copyOf(ba, len); + } + } + + private static int scale(int len, float expansionFactor) { + // We need to perform double, not float, arithmetic; otherwise + // we lose low order bits when len is larger than 2**24. + return (int)(len * (double)expansionFactor); + } + + private static Charset lookupCharset(String csn) throws UnsupportedEncodingException { + Objects.requireNonNull(csn); + try { + return Charset.forName(csn); + } catch (UnsupportedCharsetException | IllegalCharsetNameException x) { + throw new UnsupportedEncodingException(csn); + } + } + + private static byte[] encode(String csn, byte coder, byte[] val) + throws UnsupportedEncodingException + { + return encode(lookupCharset(csn), coder, val); + } + + private static byte[] encode(Charset cs, byte coder, byte[] val) { + if (val.length == 0) { + return "".value(); + } + if (cs == UTF_8) { + return encodeUTF8(coder, val, true); + } + if (cs == ISO_8859_1) { + return encode8859_1(coder, val); + } + if (cs == US_ASCII) { + return encodeASCII(coder, val); + } + CharsetEncoder ce = cs.newEncoder(); + int len = val.length >> coder; // assume LATIN1=0/UTF16=1; + int en = scale(len, ce.maxBytesPerChar()); + if (ce instanceof ArrayEncoder ae) { + // fastpath for ascii compatible + if (coder == LATIN1 && + ae.isASCIICompatible() && + !StringCoding.hasNegatives(val, 0, val.length)) { + return Arrays.copyOf(val, val.length); + } + byte[] ba = new byte[en]; + ce.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + + int blen = (coder == LATIN1) ? ae.encodeFromLatin1(val, 0, len, ba) + : ae.encodeFromUTF16(val, 0, len, ba); + if (blen != -1) { + return safeTrim(ba, blen, true); + } + } + + byte[] ba = new byte[en]; + ce.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) + : StringUTF16.toChars(val); + ByteBuffer bb = ByteBuffer.wrap(ba); + CharBuffer cb = CharBuffer.wrap(ca, 0, len); + try { + CoderResult cr = ce.encode(cb, bb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = ce.flush(bb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + throw new Error(x); + } + return safeTrim(ba, bb.position(), cs.getClass().getClassLoader0() == null); + } + + private static byte[] encode(byte coder, byte[] val) { + return encode(Charset.defaultCharset(), coder, val); + } + + private static byte[] encodeASCII(byte coder, byte[] val) { + if (coder == LATIN1) { + byte[] dst = new byte[val.length]; + for (int i = 0; i < val.length; i++) { + if (val[i] < 0) { + dst[i] = '?'; + } else { + dst[i] = val[i]; + } + } + return dst; + } + int len = val.length >> 1; + byte[] dst = new byte[len]; + int dp = 0; + for (int i = 0; i < len; i++) { + char c = StringUTF16.getChar(val, i); + if (c < 0x80) { + dst[dp++] = (byte)c; + continue; + } + if (Character.isHighSurrogate(c) && i + 1 < len && + Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { + i++; + } + dst[dp++] = '?'; + } + if (len == dp) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + + private static byte[] encode8859_1(byte coder, byte[] val) { + return encode8859_1(coder, val, true); + } + + private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { + if (coder == LATIN1) { + return Arrays.copyOf(val, val.length); + } + int len = val.length >> 1; + byte[] dst = new byte[len]; + int dp = 0; + int sp = 0; + int sl = len; + while (sp < sl) { + int ret = StringCoding.implEncodeISOArray(val, sp, dst, dp, len); + sp = sp + ret; + dp = dp + ret; + if (ret != len) { + if (!doReplace) { + throwUnmappable(sp); + } + char c = StringUTF16.getChar(val, sp++); + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { + sp++; + } + dst[dp++] = '?'; + len = sl - sp; + } + } + if (dp == dst.length) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + + //////////////////////////////// utf8 //////////////////////////////////// + + + private static boolean isNotContinuation(int b) { + return (b & 0xc0) != 0x80; + } + + private static boolean isMalformed3(int b1, int b2, int b3) { + return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; + } + + private static boolean isMalformed3_2(int b1, int b2) { + return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + (b2 & 0xc0) != 0x80; + } + + private static boolean isMalformed4(int b2, int b3, int b4) { + return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || + (b4 & 0xc0) != 0x80; + } + + private static boolean isMalformed4_2(int b1, int b2) { + return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || + (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || + (b2 & 0xc0) != 0x80; + } + + private static boolean isMalformed4_3(int b3) { + return (b3 & 0xc0) != 0x80; + } + + private static char decode2(int b1, int b2) { + return (char)(((b1 << 6) ^ b2) ^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); + } + + private static char decode3(int b1, int b2, int b3) { + return (char)((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + } + + private static int decode4(int b1, int b2, int b3, int b4) { + return ((b1 << 18) ^ + (b2 << 12) ^ + (b3 << 6) ^ + (b4 ^ + (((byte) 0xF0 << 18) ^ + ((byte) 0x80 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + } + + private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int dp, boolean doReplace) { + while (sp < sl) { + int b1 = src[sp++]; + if (b1 >= 0) { + StringUTF16.putChar(dst, dp++, (char) b1); + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + if (sp < sl) { + int b2 = src[sp++]; + if (isNotContinuation(b2)) { + if (!doReplace) { + throwMalformed(sp - 1, 1); + } + StringUTF16.putChar(dst, dp++, REPL); + sp--; + } else { + StringUTF16.putChar(dst, dp++, decode2(b1, b2)); + } + continue; + } + if (!doReplace) { + throwMalformed(sp, 1); // underflow() + } + StringUTF16.putChar(dst, dp++, REPL); + break; + } else if ((b1 >> 4) == -2) { + if (sp + 1 < sl) { + int b2 = src[sp++]; + int b3 = src[sp++]; + if (isMalformed3(b1, b2, b3)) { + if (!doReplace) { + throwMalformed(sp - 3, 3); + } + StringUTF16.putChar(dst, dp++, REPL); + sp -= 3; + sp += malformed3(src, sp); + } else { + char c = decode3(b1, b2, b3); + if (Character.isSurrogate(c)) { + if (!doReplace) { + throwMalformed(sp - 3, 3); + } + StringUTF16.putChar(dst, dp++, REPL); + } else { + StringUTF16.putChar(dst, dp++, c); + } + } + continue; + } + if (sp < sl && isMalformed3_2(b1, src[sp])) { + if (!doReplace) { + throwMalformed(sp - 1, 2); + } + StringUTF16.putChar(dst, dp++, REPL); + continue; + } + if (!doReplace) { + throwMalformed(sp, 1); + } + StringUTF16.putChar(dst, dp++, REPL); + break; + } else if ((b1 >> 3) == -2) { + if (sp + 2 < sl) { + int b2 = src[sp++]; + int b3 = src[sp++]; + int b4 = src[sp++]; + int uc = decode4(b1, b2, b3, b4); + if (isMalformed4(b2, b3, b4) || + !Character.isSupplementaryCodePoint(uc)) { // shortest form check + if (!doReplace) { + throwMalformed(sp - 4, 4); + } + StringUTF16.putChar(dst, dp++, REPL); + sp -= 4; + sp += malformed4(src, sp); + } else { + StringUTF16.putChar(dst, dp++, Character.highSurrogate(uc)); + StringUTF16.putChar(dst, dp++, Character.lowSurrogate(uc)); + } + continue; + } + b1 &= 0xff; + if (b1 > 0xf4 || sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { + if (!doReplace) { + throwMalformed(sp - 1, 1); // or 2 + } + StringUTF16.putChar(dst, dp++, REPL); + continue; + } + if (!doReplace) { + throwMalformed(sp - 1, 1); + } + sp++; + StringUTF16.putChar(dst, dp++, REPL); + if (sp < sl && isMalformed4_3(src[sp])) { + continue; + } + break; + } else { + if (!doReplace) { + throwMalformed(sp - 1, 1); + } + StringUTF16.putChar(dst, dp++, REPL); + } + } + return dp; + } + + private static int decodeWithDecoder(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length) { + ByteBuffer bb = ByteBuffer.wrap(src, offset, length); + CharBuffer cb = CharBuffer.wrap(dst, 0, dst.length); + try { + CoderResult cr = cd.decode(bb, cb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = cd.flush(cb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + // Substitution is always enabled, + // so this shouldn't happen + throw new Error(x); + } + return cb.position(); + } + + private static int malformed3(byte[] src, int sp) { + int b1 = src[sp++]; + int b2 = src[sp]; // no need to lookup b3 + return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + isNotContinuation(b2)) ? 1 : 2; + } + + private static int malformed4(byte[] src, int sp) { + // we don't care the speed here + int b1 = src[sp++] & 0xff; + int b2 = src[sp++] & 0xff; + if (b1 > 0xf4 || + (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || + (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || + isNotContinuation(b2)) + return 1; + if (isNotContinuation(src[sp])) + return 2; + return 3; + } + + private static void throwMalformed(int off, int nb) { + String msg = "malformed input off : " + off + ", length : " + nb; + throw new IllegalArgumentException(msg, new MalformedInputException(nb)); + } + + private static void throwMalformed(byte[] val) { + int dp = 0; + while (dp < val.length && val[dp] >=0) { dp++; } + throwMalformed(dp, 1); + } + + private static void throwUnmappable(int off) { + String msg = "malformed input off : " + off + ", length : 1"; + throw new IllegalArgumentException(msg, new UnmappableCharacterException(1)); + } + + private static void throwUnmappable(byte[] val) { + int dp = 0; + while (dp < val.length && val[dp] >=0) { dp++; } + throwUnmappable(dp); + } + + private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { + if (coder == UTF16) + return encodeUTF8_UTF16(val, doReplace); + + if (!StringCoding.hasNegatives(val, 0, val.length)) + return Arrays.copyOf(val, val.length); + + int dp = 0; + byte[] dst = new byte[val.length << 1]; + for (byte c : val) { + if (c < 0) { + dst[dp++] = (byte) (0xc0 | ((c & 0xff) >> 6)); + dst[dp++] = (byte) (0x80 | (c & 0x3f)); + } else { + dst[dp++] = c; + } + } + if (dp == dst.length) + return dst; + return Arrays.copyOf(dst, dp); + } + + private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { + int dp = 0; + int sp = 0; + int sl = val.length >> 1; + byte[] dst = new byte[sl * 3]; + char c; + while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { + // ascii fast loop; + dst[dp++] = (byte)c; + sp++; + } + while (sp < sl) { + c = StringUTF16.getChar(val, sp++); + if (c < 0x80) { + dst[dp++] = (byte)c; + } else if (c < 0x800) { + dst[dp++] = (byte)(0xc0 | (c >> 6)); + dst[dp++] = (byte)(0x80 | (c & 0x3f)); + } else if (Character.isSurrogate(c)) { + int uc = -1; + char c2; + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { + uc = Character.toCodePoint(c, c2); + } + if (uc < 0) { + if (doReplace) { + dst[dp++] = '?'; + } else { + throwUnmappable(sp - 1); + } + } else { + dst[dp++] = (byte)(0xf0 | ((uc >> 18))); + dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); + dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); + dst[dp++] = (byte)(0x80 | (uc & 0x3f)); + sp++; // 2 chars + } + } else { + // 3 bytes, 16 bits + dst[dp++] = (byte)(0xe0 | ((c >> 12))); + dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); + dst[dp++] = (byte)(0x80 | (c & 0x3f)); + } + } + if (dp == dst.length) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + + /* + * Throws iae, instead of replacing, if unmappable. + */ + static byte[] getBytesUTF8NoRepl(String s) { + return encodeUTF8(s.coder(), s.value(), false); + } + + ////////////////////// for j.n.f.Files ////////////////////////// + + private static boolean isASCII(byte[] src) { + return !StringCoding.hasNegatives(src, 0, src.length); + } + + /* + * Throws CCE, instead of replacing, if unmappable. + */ + static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { + try { + return getBytesNoRepl1(s, cs); + } catch (IllegalArgumentException e) { + //getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause + Throwable cause = e.getCause(); + if (cause instanceof UnmappableCharacterException) { + throw (UnmappableCharacterException)cause; + } + throw (CharacterCodingException)cause; + } + } + + private static byte[] getBytesNoRepl1(String s, Charset cs) { + byte[] val = s.value(); + byte coder = s.coder(); + if (cs == UTF_8) { + if (coder == LATIN1 && isASCII(val)) { + return val; + } + return encodeUTF8(coder, val, false); + } + if (cs == ISO_8859_1) { + if (coder == LATIN1) { + return val; + } + return encode8859_1(coder, val, false); + } + if (cs == US_ASCII) { + if (coder == LATIN1) { + if (isASCII(val)) { + return val; + } else { + throwUnmappable(val); + } + } + } + CharsetEncoder ce = cs.newEncoder(); + // fastpath for ascii compatible + if (coder == LATIN1 && + ce instanceof ArrayEncoder ae && + ae.isASCIICompatible() && + isASCII(val)) { + return val; + } + int len = val.length >> coder; // assume LATIN1=0/UTF16=1; + int en = scale(len, ce.maxBytesPerChar()); + byte[] ba = new byte[en]; + if (len == 0) { + return ba; + } + if (ce instanceof ArrayEncoder ae) { + int blen = (coder == LATIN1 ) ? ae.encodeFromLatin1(val, 0, len, ba) + : ae.encodeFromUTF16(val, 0, len, ba); + if (blen != -1) { + return safeTrim(ba, blen, true); + } + } + char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) + : StringUTF16.toChars(val); + ByteBuffer bb = ByteBuffer.wrap(ba); + CharBuffer cb = CharBuffer.wrap(ca, 0, len); + try { + CoderResult cr = ce.encode(cb, bb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = ce.flush(bb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + throw new IllegalArgumentException(x); + } + return safeTrim(ba, bb.position(), cs.getClass().getClassLoader0() == null ); + } + /** * Constructs a new {@code String} by decoding the specified array of bytes * using the specified {@linkplain java.nio.charset.Charset charset}. The @@ -1219,7 +1766,7 @@ public void getBytes(int srcBegin, int srcEnd, byte dst[], int dstBegin) { public byte[] getBytes(String charsetName) throws UnsupportedEncodingException { if (charsetName == null) throw new NullPointerException(); - return StringCoding.encode(charsetName, coder(), value); + return encode(charsetName, coder(), value); } /** @@ -1242,7 +1789,7 @@ public byte[] getBytes(String charsetName) */ public byte[] getBytes(Charset charset) { if (charset == null) throw new NullPointerException(); - return StringCoding.encode(charset, coder(), value); + return encode(charset, coder(), value); } /** @@ -1259,7 +1806,7 @@ public byte[] getBytes(Charset charset) { * @since 1.1 */ public byte[] getBytes() { - return StringCoding.encode(coder(), value); + return encode(coder(), value); } /** diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index b455e167bf7fc..22439ed6ea30b 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -25,255 +25,22 @@ package java.lang; -import java.io.UnsupportedEncodingException; -import java.lang.ref.SoftReference; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CharsetEncoder; -import java.nio.charset.CharacterCodingException; -import java.nio.charset.CoderResult; -import java.nio.charset.CodingErrorAction; -import java.nio.charset.IllegalCharsetNameException; -import java.nio.charset.MalformedInputException; -import java.nio.charset.UnmappableCharacterException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.Arrays; -import java.util.Objects; import jdk.internal.vm.annotation.IntrinsicCandidate; -import sun.nio.cs.HistoricallyNamedCharset; -import sun.nio.cs.ArrayEncoder; -import static java.lang.String.LATIN1; -import static java.lang.String.UTF16; -import static java.lang.StringUTF16.putChar; /** * Utility class for string encoding and decoding. */ - class StringCoding { private StringCoding() { } - /** The cached coders for each thread */ - private static final ThreadLocal> encoder = - new ThreadLocal<>(); - static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; - static final char REPL = '\ufffd'; - - private static T deref(ThreadLocal> tl) { - SoftReference sr = tl.get(); - if (sr == null) - return null; - return sr.get(); - } - - private static void set(ThreadLocal> tl, T ob) { - tl.set(new SoftReference<>(ob)); - } - - // Trim the given byte array to the given length - private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { - if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) - return ba; - else - return Arrays.copyOf(ba, len); - } - - static int scale(int len, float expansionFactor) { - // We need to perform double, not float, arithmetic; otherwise - // we lose low order bits when len is larger than 2**24. - return (int)(len * (double)expansionFactor); - } - - static Charset lookupCharset(String csn) throws UnsupportedEncodingException { - Objects.requireNonNull(csn); - try { - return Charset.forName(csn); - } catch (UnsupportedCharsetException | IllegalCharsetNameException x) { - throw new UnsupportedEncodingException(csn); - } - } - - @IntrinsicCandidate - public static boolean hasNegatives(byte[] ba, int off, int len) { - for (int i = off; i < off + len; i++) { - if (ba[i] < 0) { - return true; - } - } - return false; - } - - // -- Encoding -- - private static class StringEncoder { - private Charset cs; - private CharsetEncoder ce; - private final boolean isASCIICompatible; - private final String requestedCharsetName; - private final boolean isTrusted; - - private StringEncoder(Charset cs, String rcn) { - this.requestedCharsetName = rcn; - this.cs = cs; - this.ce = cs.newEncoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - this.isTrusted = (cs.getClass().getClassLoader0() == null); - this.isASCIICompatible = (ce instanceof ArrayEncoder) && - ((ArrayEncoder)ce).isASCIICompatible(); - } - - private String charsetName() { - if (cs instanceof HistoricallyNamedCharset) - return ((HistoricallyNamedCharset)cs).historicalName(); - return cs.name(); - } - - private final String requestedCharsetName() { - return requestedCharsetName; - } - - private byte[] encode(byte coder, byte[] val) { - // fastpath for ascii compatible - if (coder == LATIN1 && isASCIICompatible && - !hasNegatives(val, 0, val.length)) { - return Arrays.copyOf(val, val.length); - } - int len = val.length >> coder; // assume LATIN1=0/UTF16=1; - int en = scale(len, ce.maxBytesPerChar()); - byte[] ba = new byte[en]; - if (len == 0) { - return ba; - } - if (ce instanceof ArrayEncoder) { - int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) - : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); - if (blen != -1) { - return safeTrim(ba, blen, isTrusted); - } - } - char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) - : StringUTF16.toChars(val); - ce.reset(); - ByteBuffer bb = ByteBuffer.wrap(ba); - CharBuffer cb = CharBuffer.wrap(ca, 0, len); - try { - CoderResult cr = ce.encode(cb, bb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = ce.flush(bb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } - return safeTrim(ba, bb.position(), isTrusted); - } - } - - static byte[] encode(String csn, byte coder, byte[] val) - throws UnsupportedEncodingException - { - StringEncoder se = deref(encoder); - if ((se == null) || !(csn.equals(se.requestedCharsetName()) - || csn.equals(se.charsetName()))) { - Charset cs = lookupCharset(csn); - if (cs == UTF_8) { - return encodeUTF8(coder, val, true); - } - if (cs == ISO_8859_1) { - return encode8859_1(coder, val); - } - if (cs == US_ASCII) { - return encodeASCII(coder, val); - } - se = new StringEncoder(cs, csn); - set(encoder, se); - } - return se.encode(coder, val); - } - - static byte[] encode(Charset cs, byte coder, byte[] val) { - if (cs == UTF_8) { - return encodeUTF8(coder, val, true); - } - if (cs == ISO_8859_1) { - return encode8859_1(coder, val); - } - if (cs == US_ASCII) { - return encodeASCII(coder, val); - } - CharsetEncoder ce = cs.newEncoder(); - // fastpath for ascii compatible - if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && - ((ArrayEncoder)ce).isASCIICompatible() && - !hasNegatives(val, 0, val.length)))) { - return Arrays.copyOf(val, val.length); - } - int len = val.length >> coder; // assume LATIN1=0/UTF16=1; - int en = scale(len, ce.maxBytesPerChar()); - byte[] ba = new byte[en]; - if (len == 0) { - return ba; - } - ce.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE) - .reset(); - if (ce instanceof ArrayEncoder) { - int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) - : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); - if (blen != -1) { - return safeTrim(ba, blen, true); - } - } - boolean isTrusted = cs.getClass().getClassLoader0() == null || - System.getSecurityManager() == null; - char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) - : StringUTF16.toChars(val); - ByteBuffer bb = ByteBuffer.wrap(ba); - CharBuffer cb = CharBuffer.wrap(ca, 0, len); - try { - CoderResult cr = ce.encode(cb, bb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = ce.flush(bb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - throw new Error(x); - } - return safeTrim(ba, bb.position(), isTrusted); - } - - static byte[] encode(byte coder, byte[] val) { - Charset cs = Charset.defaultCharset(); - if (cs == UTF_8) { - return encodeUTF8(coder, val, true); - } - if (cs == ISO_8859_1) { - return encode8859_1(coder, val); - } - if (cs == US_ASCII) { - return encodeASCII(coder, val); - } - StringEncoder se = deref(encoder); - if (se == null || !cs.name().equals(se.cs.name())) { - se = new StringEncoder(cs, cs.name()); - set(encoder, se); - } - return se.encode(coder, val); - } - /** * Print a message directly to stderr, bypassing all character conversion * methods. @@ -281,41 +48,18 @@ static byte[] encode(byte coder, byte[] val) { */ private static native void err(String msg); - private static byte[] encodeASCII(byte coder, byte[] val) { - if (coder == LATIN1) { - byte[] dst = new byte[val.length]; - for (int i = 0; i < val.length; i++) { - if (val[i] < 0) { - dst[i] = '?'; - } else { - dst[i] = val[i]; - } - } - return dst; - } - int len = val.length >> 1; - byte[] dst = new byte[len]; - int dp = 0; - for (int i = 0; i < len; i++) { - char c = StringUTF16.getChar(val, i); - if (c < 0x80) { - dst[dp++] = (byte)c; - continue; - } - if (Character.isHighSurrogate(c) && i + 1 < len && - Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { - i++; + @IntrinsicCandidate + public static boolean hasNegatives(byte[] ba, int off, int len) { + for (int i = off; i < off + len; i++) { + if (ba[i] < 0) { + return true; } - dst[dp++] = '?'; } - if (len == dp) { - return dst; - } - return Arrays.copyOf(dst, dp); + return false; } @IntrinsicCandidate - private static int implEncodeISOArray(byte[] sa, int sp, + public static int implEncodeISOArray(byte[] sa, int sp, byte[] da, int dp, int len) { int i = 0; for (; i < len; i++) { @@ -327,432 +71,4 @@ private static int implEncodeISOArray(byte[] sa, int sp, return i; } - private static byte[] encode8859_1(byte coder, byte[] val) { - return encode8859_1(coder, val, true); - } - - private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { - if (coder == LATIN1) { - return Arrays.copyOf(val, val.length); - } - int len = val.length >> 1; - byte[] dst = new byte[len]; - int dp = 0; - int sp = 0; - int sl = len; - while (sp < sl) { - int ret = implEncodeISOArray(val, sp, dst, dp, len); - sp = sp + ret; - dp = dp + ret; - if (ret != len) { - if (!doReplace) { - throwUnmappable(sp); - } - char c = StringUTF16.getChar(val, sp++); - if (Character.isHighSurrogate(c) && sp < sl && - Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { - sp++; - } - dst[dp++] = '?'; - len = sl - sp; - } - } - if (dp == dst.length) { - return dst; - } - return Arrays.copyOf(dst, dp); - } - - //////////////////////////////// utf8 //////////////////////////////////// - - - static boolean isNotContinuation(int b) { - return (b & 0xc0) != 0x80; - } - - private static boolean isMalformed3(int b1, int b2, int b3) { - return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || - (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; - } - - private static boolean isMalformed3_2(int b1, int b2) { - return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || - (b2 & 0xc0) != 0x80; - } - - private static boolean isMalformed4(int b2, int b3, int b4) { - return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || - (b4 & 0xc0) != 0x80; - } - - private static boolean isMalformed4_2(int b1, int b2) { - return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || - (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || - (b2 & 0xc0) != 0x80; - } - - private static boolean isMalformed4_3(int b3) { - return (b3 & 0xc0) != 0x80; - } - - static char decode2(int b1, int b2) { - return (char)(((b1 << 6) ^ b2) ^ - (((byte) 0xC0 << 6) ^ - ((byte) 0x80 << 0))); - } - - private static char decode3(int b1, int b2, int b3) { - return (char)((b1 << 12) ^ - (b2 << 6) ^ - (b3 ^ - (((byte) 0xE0 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - } - - private static int decode4(int b1, int b2, int b3, int b4) { - return ((b1 << 18) ^ - (b2 << 12) ^ - (b3 << 6) ^ - (b4 ^ - (((byte) 0xF0 << 18) ^ - ((byte) 0x80 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0)))); - } - - static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int dp, boolean doReplace) { - while (sp < sl) { - int b1 = src[sp++]; - if (b1 >= 0) { - putChar(dst, dp++, (char) b1); - } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { - if (sp < sl) { - int b2 = src[sp++]; - if (StringCoding.isNotContinuation(b2)) { - if (!doReplace) { - throwMalformed(sp - 1, 1); - } - putChar(dst, dp++, REPL); - sp--; - } else { - putChar(dst, dp++, decode2(b1, b2)); - } - continue; - } - if (!doReplace) { - throwMalformed(sp, 1); // underflow() - } - putChar(dst, dp++, REPL); - break; - } else if ((b1 >> 4) == -2) { - if (sp + 1 < sl) { - int b2 = src[sp++]; - int b3 = src[sp++]; - if (isMalformed3(b1, b2, b3)) { - if (!doReplace) { - throwMalformed(sp - 3, 3); - } - putChar(dst, dp++, REPL); - sp -= 3; - sp += malformedN(src, sp, 3); - } else { - char c = decode3(b1, b2, b3); - if (Character.isSurrogate(c)) { - if (!doReplace) { - throwMalformed(sp - 3, 3); - } - putChar(dst, dp++, REPL); - } else { - putChar(dst, dp++, c); - } - } - continue; - } - if (sp < sl && isMalformed3_2(b1, src[sp])) { - if (!doReplace) { - throwMalformed(sp - 1, 2); - } - putChar(dst, dp++, REPL); - continue; - } - if (!doReplace) { - throwMalformed(sp, 1); - } - putChar(dst, dp++, REPL); - break; - } else if ((b1 >> 3) == -2) { - if (sp + 2 < sl) { - int b2 = src[sp++]; - int b3 = src[sp++]; - int b4 = src[sp++]; - int uc = decode4(b1, b2, b3, b4); - if (isMalformed4(b2, b3, b4) || - !Character.isSupplementaryCodePoint(uc)) { // shortest form check - if (!doReplace) { - throwMalformed(sp - 4, 4); - } - putChar(dst, dp++, REPL); - sp -= 4; - sp += StringCoding.malformedN(src, sp, 4); - } else { - putChar(dst, dp++, Character.highSurrogate(uc)); - putChar(dst, dp++, Character.lowSurrogate(uc)); - } - continue; - } - b1 &= 0xff; - if (b1 > 0xf4 || sp < sl && StringCoding.isMalformed4_2(b1, src[sp] & 0xff)) { - if (!doReplace) { - throwMalformed(sp - 1, 1); // or 2 - } - putChar(dst, dp++, REPL); - continue; - } - if (!doReplace) { - throwMalformed(sp - 1, 1); - } - sp++; - putChar(dst, dp++, REPL); - if (sp < sl && StringCoding.isMalformed4_3(src[sp])) { - continue; - } - break; - } else { - if (!doReplace) { - throwMalformed(sp - 1, 1); - } - putChar(dst, dp++, REPL); - } - } - return dp; - } - - static int decodeWithDecoder(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length) { - ByteBuffer bb = ByteBuffer.wrap(src, offset, length); - CharBuffer cb = CharBuffer.wrap(dst, 0, dst.length); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } - return cb.position(); - } - - // for nb == 3/4 - static int malformedN(byte[] src, int sp, int nb) { - if (nb == 3) { - int b1 = src[sp++]; - int b2 = src[sp]; // no need to lookup b3 - return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || - isNotContinuation(b2)) ? 1 : 2; - } else if (nb == 4) { // we don't care the speed here - int b1 = src[sp++] & 0xff; - int b2 = src[sp++] & 0xff; - if (b1 > 0xf4 || - (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || - (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || - isNotContinuation(b2)) - return 1; - if (isNotContinuation(src[sp])) - return 2; - return 3; - } - assert false; - return -1; - } - - private static void throwMalformed(int off, int nb) { - String msg = "malformed input off : " + off + ", length : " + nb; - throw new IllegalArgumentException(msg, new MalformedInputException(nb)); - } - - static void throwMalformed(byte[] val) { - int dp = 0; - while (dp < val.length && val[dp] >=0) { dp++; } - throwMalformed(dp, 1); - } - - private static void throwUnmappable(int off) { - String msg = "malformed input off : " + off + ", length : 1"; - throw new IllegalArgumentException(msg, new UnmappableCharacterException(1)); - } - - static void throwUnmappable(byte[] val) { - int dp = 0; - while (dp < val.length && val[dp] >=0) { dp++; } - throwUnmappable(dp); - } - - private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { - if (coder == UTF16) - return encodeUTF8_UTF16(val, doReplace); - - if (!hasNegatives(val, 0, val.length)) - return Arrays.copyOf(val, val.length); - - int dp = 0; - byte[] dst = new byte[val.length << 1]; - for (byte c : val) { - if (c < 0) { - dst[dp++] = (byte) (0xc0 | ((c & 0xff) >> 6)); - dst[dp++] = (byte) (0x80 | (c & 0x3f)); - } else { - dst[dp++] = c; - } - } - if (dp == dst.length) - return dst; - return Arrays.copyOf(dst, dp); - } - - private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { - int dp = 0; - int sp = 0; - int sl = val.length >> 1; - byte[] dst = new byte[sl * 3]; - char c; - while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { - // ascii fast loop; - dst[dp++] = (byte)c; - sp++; - } - while (sp < sl) { - c = StringUTF16.getChar(val, sp++); - if (c < 0x80) { - dst[dp++] = (byte)c; - } else if (c < 0x800) { - dst[dp++] = (byte)(0xc0 | (c >> 6)); - dst[dp++] = (byte)(0x80 | (c & 0x3f)); - } else if (Character.isSurrogate(c)) { - int uc = -1; - char c2; - if (Character.isHighSurrogate(c) && sp < sl && - Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { - uc = Character.toCodePoint(c, c2); - } - if (uc < 0) { - if (doReplace) { - dst[dp++] = '?'; - } else { - throwUnmappable(sp - 1); - } - } else { - dst[dp++] = (byte)(0xf0 | ((uc >> 18))); - dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); - dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); - dst[dp++] = (byte)(0x80 | (uc & 0x3f)); - sp++; // 2 chars - } - } else { - // 3 bytes, 16 bits - dst[dp++] = (byte)(0xe0 | ((c >> 12))); - dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); - dst[dp++] = (byte)(0x80 | (c & 0x3f)); - } - } - if (dp == dst.length) { - return dst; - } - return Arrays.copyOf(dst, dp); - } - - /* - * Throws iae, instead of replacing, if unmappable. - */ - static byte[] getBytesUTF8NoRepl(String s) { - return encodeUTF8(s.coder(), s.value(), false); - } - - ////////////////////// for j.n.f.Files ////////////////////////// - - private static boolean isASCII(byte[] src) { - return !hasNegatives(src, 0, src.length); - } - - /* - * Throws CCE, instead of replacing, if unmappable. - */ - static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { - try { - return getBytesNoRepl1(s, cs); - } catch (IllegalArgumentException e) { - //getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause - Throwable cause = e.getCause(); - if (cause instanceof UnmappableCharacterException) { - throw (UnmappableCharacterException)cause; - } - throw (CharacterCodingException)cause; - } - } - - static byte[] getBytesNoRepl1(String s, Charset cs) { - byte[] val = s.value(); - byte coder = s.coder(); - if (cs == UTF_8) { - if (coder == LATIN1 && isASCII(val)) { - return val; - } - return encodeUTF8(coder, val, false); - } - if (cs == ISO_8859_1) { - if (coder == LATIN1) { - return val; - } - return encode8859_1(coder, val, false); - } - if (cs == US_ASCII) { - if (coder == LATIN1) { - if (isASCII(val)) { - return val; - } else { - throwUnmappable(val); - } - } - } - CharsetEncoder ce = cs.newEncoder(); - // fastpath for ascii compatible - if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && - ((ArrayEncoder)ce).isASCIICompatible() && - isASCII(val)))) { - return val; - } - int len = val.length >> coder; // assume LATIN1=0/UTF16=1; - int en = scale(len, ce.maxBytesPerChar()); - byte[] ba = new byte[en]; - if (len == 0) { - return ba; - } - if (ce instanceof ArrayEncoder) { - int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) - : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); - if (blen != -1) { - return safeTrim(ba, blen, true); - } - } - boolean isTrusted = cs.getClass().getClassLoader0() == null || - System.getSecurityManager() == null; - char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) - : StringUTF16.toChars(val); - ByteBuffer bb = ByteBuffer.wrap(ba); - CharBuffer cb = CharBuffer.wrap(ca, 0, len); - try { - CoderResult cr = ce.encode(cb, bb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = ce.flush(bb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - throw new IllegalArgumentException(x); - } - return safeTrim(ba, bb.position(), isTrusted); - } } diff --git a/src/java.base/share/classes/java/lang/System.java b/src/java.base/share/classes/java/lang/System.java index e536818fbf6e9..db1b20b3fd703 100644 --- a/src/java.base/share/classes/java/lang/System.java +++ b/src/java.base/share/classes/java/lang/System.java @@ -2266,7 +2266,7 @@ public String newStringNoRepl(byte[] bytes, Charset cs) throws CharacterCodingEx } public byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { - return StringCoding.getBytesNoRepl(s, cs); + return String.getBytesNoRepl(s, cs); } public String newStringUTF8NoRepl(byte[] bytes, int off, int len) { @@ -2274,7 +2274,7 @@ public String newStringUTF8NoRepl(byte[] bytes, int off, int len) { } public byte[] getBytesUTF8NoRepl(String s) { - return StringCoding.getBytesUTF8NoRepl(s); + return String.getBytesUTF8NoRepl(s); } public void setCause(Throwable t, Throwable cause) { diff --git a/test/micro/org/openjdk/bench/java/lang/StringEncode.java b/test/micro/org/openjdk/bench/java/lang/StringEncode.java new file mode 100644 index 0000000000000..4cf5032a0dad3 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/lang/StringEncode.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.lang; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; + +import java.nio.charset.Charset; +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(value = 3, jvmArgs = "-Xmx1g") +@Warmup(iterations = 5, time = 2) +@Measurement(iterations = 5, time = 3) +@State(Scope.Thread) +public class StringEncode { + + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Fork(value = 3, jvmArgs = "-Xmx1g") + @Warmup(iterations = 5, time = 2) + @Measurement(iterations = 5, time = 2) + @State(Scope.Thread) + public static class WithCharset { + + @Param({"US-ASCII", "ISO-8859-1", "UTF-8", "MS932", "ISO-8859-6"}) + private String charsetName; + + private Charset charset; + private String asciiString; + private String utf16String; + + @Setup + public void setup() { + charset = Charset.forName(charsetName); + asciiString = "ascii string"; + utf16String = "UTF-\uFF11\uFF16 string"; + } + + @Benchmark + public void encodeCharsetName(Blackhole bh) throws Exception { + bh.consume(asciiString.getBytes(charsetName)); + bh.consume(utf16String.getBytes(charsetName)); + } + + @Benchmark + public void encodeCharset(Blackhole bh) throws Exception { + bh.consume(asciiString.getBytes(charset)); + bh.consume(utf16String.getBytes(charset)); + } + } + + private String asciiDefaultString; + private String utf16DefaultString; + + @Setup + public void setup() { + asciiDefaultString = "ascii string"; + utf16DefaultString = "UTF-\uFF11\uFF16 string"; + } + + @Benchmark + public void encodeDefault(Blackhole bh) throws Exception { + bh.consume(asciiDefaultString.getBytes()); + bh.consume(utf16DefaultString.getBytes()); + } +} From 6808d4dbd72f6c3e4ba50c0e800c88fc6e469da8 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 21 Jan 2021 21:24:10 +0100 Subject: [PATCH 22/25] Simplify getBytes -> encode --- .../share/classes/java/lang/String.java | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 1ed1bedb6c080..dd6936471172f 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -816,16 +816,7 @@ private static Charset lookupCharset(String csn) throws UnsupportedEncodingExcep } } - private static byte[] encode(String csn, byte coder, byte[] val) - throws UnsupportedEncodingException - { - return encode(lookupCharset(csn), coder, val); - } - private static byte[] encode(Charset cs, byte coder, byte[] val) { - if (val.length == 0) { - return "".value(); - } if (cs == UTF_8) { return encodeUTF8(coder, val, true); } @@ -846,6 +837,9 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { return Arrays.copyOf(val, val.length); } byte[] ba = new byte[en]; + if (len == 0) { + return ba; + } ce.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); @@ -857,6 +851,9 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { } byte[] ba = new byte[en]; + if (len == 0) { + return ba; + } ce.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) @@ -876,10 +873,6 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { return safeTrim(ba, bb.position(), cs.getClass().getClassLoader0() == null); } - private static byte[] encode(byte coder, byte[] val) { - return encode(Charset.defaultCharset(), coder, val); - } - private static byte[] encodeASCII(byte coder, byte[] val) { if (coder == LATIN1) { byte[] dst = new byte[val.length]; @@ -1766,7 +1759,7 @@ public void getBytes(int srcBegin, int srcEnd, byte dst[], int dstBegin) { public byte[] getBytes(String charsetName) throws UnsupportedEncodingException { if (charsetName == null) throw new NullPointerException(); - return encode(charsetName, coder(), value); + return encode(lookupCharset(charsetName), coder(), value); } /** @@ -1806,7 +1799,7 @@ public byte[] getBytes(Charset charset) { * @since 1.1 */ public byte[] getBytes() { - return encode(coder(), value); + return encode(Charset.defaultCharset(), coder(), value); } /** From 2143cb3e59e1062ea8465720714dbd6cd17edafc Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Thu, 21 Jan 2021 21:43:45 +0100 Subject: [PATCH 23/25] Reduce code duplication in getBytes/getBytesNoRepl --- .../share/classes/java/lang/String.java | 175 ++++++++---------- 1 file changed, 74 insertions(+), 101 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index dd6936471172f..f9c830bda1890 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -826,6 +826,10 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { if (cs == US_ASCII) { return encodeASCII(coder, val); } + return encodeWithEncoder(cs, coder, val, true); + } + + private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val, boolean doReplace) { CharsetEncoder ce = cs.newEncoder(); int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); @@ -840,8 +844,10 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { if (len == 0) { return ba; } - ce.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); + if (doReplace) { + ce.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + } int blen = (coder == LATIN1) ? ae.encodeFromLatin1(val, 0, len, ba) : ae.encodeFromUTF16(val, 0, len, ba); @@ -854,8 +860,10 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { if (len == 0) { return ba; } - ce.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); + if (doReplace) { + ce.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + } char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) : StringUTF16.toChars(val); ByteBuffer bb = ByteBuffer.wrap(ba); @@ -868,19 +876,75 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { - throw new Error(x); + if (doReplace) { + throw new IllegalArgumentException(x); + } else { + throw new Error(x); + } } return safeTrim(ba, bb.position(), cs.getClass().getClassLoader0() == null); } + /* + * Throws iae, instead of replacing, if unmappable. + */ + static byte[] getBytesUTF8NoRepl(String s) { + return encodeUTF8(s.coder(), s.value(), false); + } + + private static boolean isASCII(byte[] src) { + return !StringCoding.hasNegatives(src, 0, src.length); + } + + /* + * Throws CCE, instead of replacing, if unmappable. + */ + static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { + try { + return getBytesNoRepl1(s, cs); + } catch (IllegalArgumentException e) { + //getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause + Throwable cause = e.getCause(); + if (cause instanceof UnmappableCharacterException) { + throw (UnmappableCharacterException)cause; + } + throw (CharacterCodingException)cause; + } + } + + private static byte[] getBytesNoRepl1(String s, Charset cs) { + byte[] val = s.value(); + byte coder = s.coder(); + if (cs == UTF_8) { + if (coder == LATIN1 && isASCII(val)) { + return val; + } + return encodeUTF8(coder, val, false); + } + if (cs == ISO_8859_1) { + if (coder == LATIN1) { + return val; + } + return encode8859_1(coder, val, false); + } + if (cs == US_ASCII) { + if (coder == LATIN1) { + if (isASCII(val)) { + return val; + } else { + throwUnmappable(val); + } + } + } + return encodeWithEncoder(cs, coder, val, false); + } + private static byte[] encodeASCII(byte coder, byte[] val) { if (coder == LATIN1) { - byte[] dst = new byte[val.length]; - for (int i = 0; i < val.length; i++) { - if (val[i] < 0) { + byte[] dst = Arrays.copyOf(val, val.length); + for (int i = 0; i < dst.length; i++) { + if (dst[i] < 0) { dst[i] = '?'; - } else { - dst[i] = val[i]; } } return dst; @@ -1241,97 +1305,6 @@ private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { return Arrays.copyOf(dst, dp); } - /* - * Throws iae, instead of replacing, if unmappable. - */ - static byte[] getBytesUTF8NoRepl(String s) { - return encodeUTF8(s.coder(), s.value(), false); - } - - ////////////////////// for j.n.f.Files ////////////////////////// - - private static boolean isASCII(byte[] src) { - return !StringCoding.hasNegatives(src, 0, src.length); - } - - /* - * Throws CCE, instead of replacing, if unmappable. - */ - static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { - try { - return getBytesNoRepl1(s, cs); - } catch (IllegalArgumentException e) { - //getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause - Throwable cause = e.getCause(); - if (cause instanceof UnmappableCharacterException) { - throw (UnmappableCharacterException)cause; - } - throw (CharacterCodingException)cause; - } - } - - private static byte[] getBytesNoRepl1(String s, Charset cs) { - byte[] val = s.value(); - byte coder = s.coder(); - if (cs == UTF_8) { - if (coder == LATIN1 && isASCII(val)) { - return val; - } - return encodeUTF8(coder, val, false); - } - if (cs == ISO_8859_1) { - if (coder == LATIN1) { - return val; - } - return encode8859_1(coder, val, false); - } - if (cs == US_ASCII) { - if (coder == LATIN1) { - if (isASCII(val)) { - return val; - } else { - throwUnmappable(val); - } - } - } - CharsetEncoder ce = cs.newEncoder(); - // fastpath for ascii compatible - if (coder == LATIN1 && - ce instanceof ArrayEncoder ae && - ae.isASCIICompatible() && - isASCII(val)) { - return val; - } - int len = val.length >> coder; // assume LATIN1=0/UTF16=1; - int en = scale(len, ce.maxBytesPerChar()); - byte[] ba = new byte[en]; - if (len == 0) { - return ba; - } - if (ce instanceof ArrayEncoder ae) { - int blen = (coder == LATIN1 ) ? ae.encodeFromLatin1(val, 0, len, ba) - : ae.encodeFromUTF16(val, 0, len, ba); - if (blen != -1) { - return safeTrim(ba, blen, true); - } - } - char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) - : StringUTF16.toChars(val); - ByteBuffer bb = ByteBuffer.wrap(ba); - CharBuffer cb = CharBuffer.wrap(ca, 0, len); - try { - CoderResult cr = ce.encode(cb, bb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = ce.flush(bb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - throw new IllegalArgumentException(x); - } - return safeTrim(ba, bb.position(), cs.getClass().getClassLoader0() == null ); - } - /** * Constructs a new {@code String} by decoding the specified array of bytes * using the specified {@linkplain java.nio.charset.Charset charset}. The From feb8201dff898dd800161f83fcff0d001322780f Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 22 Jan 2021 00:14:47 +0100 Subject: [PATCH 24/25] Remove StringCoding Charset constants --- .../share/classes/java/lang/String.java | 32 +++++++++---------- .../share/classes/java/lang/StringCoding.java | 7 ---- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index f9c830bda1890..cbdb297e5970b 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -56,9 +56,9 @@ import sun.nio.cs.ArrayDecoder; import sun.nio.cs.ArrayEncoder; -import static java.lang.StringCoding.ISO_8859_1; -import static java.lang.StringCoding.US_ASCII; -import static java.lang.StringCoding.UTF_8; +import sun.nio.cs.ISO_8859_1; +import sun.nio.cs.US_ASCII; +import sun.nio.cs.UTF_8; /** * The {@code String} class represents character strings. All @@ -522,7 +522,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { if (length == 0) { this.value = "".value; this.coder = "".coder; - } else if (charset == UTF_8) { + } else if (charset == UTF_8.INSTANCE) { if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { this.value = Arrays.copyOfRange(bytes, offset, offset + length); this.coder = LATIN1; @@ -575,7 +575,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { this.value = dst; this.coder = UTF16; } - } else if (charset == ISO_8859_1) { + } else if (charset == ISO_8859_1.INSTANCE) { if (COMPACT_STRINGS) { this.value = Arrays.copyOfRange(bytes, offset, offset + length); this.coder = LATIN1; @@ -583,7 +583,7 @@ public String(byte[] bytes, int offset, int length, Charset charset) { this.value = StringLatin1.inflate(bytes, offset, length); this.coder = UTF16; } - } else if (charset == US_ASCII) { + } else if (charset == US_ASCII.INSTANCE) { if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { this.value = Arrays.copyOfRange(bytes, offset, offset + length); this.coder = LATIN1; @@ -749,15 +749,15 @@ private static String newStringNoRepl1(byte[] src, Charset cs) { if (len == 0) { return ""; } - if (cs == UTF_8) { + if (cs == UTF_8.INSTANCE) { return newStringUTF8NoRepl(src, 0, src.length); } - if (cs == ISO_8859_1) { + if (cs == ISO_8859_1.INSTANCE) { if (COMPACT_STRINGS) return new String(src, LATIN1); return new String(StringLatin1.inflate(src, 0, src.length), UTF16); } - if (cs == US_ASCII) { + if (cs == US_ASCII.INSTANCE) { if (!StringCoding.hasNegatives(src, 0, src.length)) { if (COMPACT_STRINGS) return new String(src, LATIN1); @@ -772,7 +772,7 @@ private static String newStringNoRepl1(byte[] src, Charset cs) { if (cd instanceof ArrayDecoder ad && ad.isASCIICompatible() && !StringCoding.hasNegatives(src, 0, src.length)) { - return new String(src, 0, src.length, ISO_8859_1); + return new String(src, 0, src.length, ISO_8859_1.INSTANCE); } int en = scale(len, cd.maxCharsPerByte()); char[] ca = new char[en]; @@ -817,13 +817,13 @@ private static Charset lookupCharset(String csn) throws UnsupportedEncodingExcep } private static byte[] encode(Charset cs, byte coder, byte[] val) { - if (cs == UTF_8) { + if (cs == UTF_8.INSTANCE) { return encodeUTF8(coder, val, true); } - if (cs == ISO_8859_1) { + if (cs == ISO_8859_1.INSTANCE) { return encode8859_1(coder, val); } - if (cs == US_ASCII) { + if (cs == US_ASCII.INSTANCE) { return encodeASCII(coder, val); } return encodeWithEncoder(cs, coder, val, true); @@ -915,19 +915,19 @@ static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingExcepti private static byte[] getBytesNoRepl1(String s, Charset cs) { byte[] val = s.value(); byte coder = s.coder(); - if (cs == UTF_8) { + if (cs == UTF_8.INSTANCE) { if (coder == LATIN1 && isASCII(val)) { return val; } return encodeUTF8(coder, val, false); } - if (cs == ISO_8859_1) { + if (cs == ISO_8859_1.INSTANCE) { if (coder == LATIN1) { return val; } return encode8859_1(coder, val, false); } - if (cs == US_ASCII) { + if (cs == US_ASCII.INSTANCE) { if (coder == LATIN1) { if (isASCII(val)) { return val; diff --git a/src/java.base/share/classes/java/lang/StringCoding.java b/src/java.base/share/classes/java/lang/StringCoding.java index 22439ed6ea30b..4efa1a19c1d8b 100644 --- a/src/java.base/share/classes/java/lang/StringCoding.java +++ b/src/java.base/share/classes/java/lang/StringCoding.java @@ -25,11 +25,8 @@ package java.lang; -import java.nio.charset.Charset; - import jdk.internal.vm.annotation.IntrinsicCandidate; - /** * Utility class for string encoding and decoding. */ @@ -37,10 +34,6 @@ class StringCoding { private StringCoding() { } - static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; - static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; - static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; - /** * Print a message directly to stderr, bypassing all character conversion * methods. From 14928bfd9254422e56987f4bd1d4d1f79a7a5b62 Mon Sep 17 00:00:00 2001 From: Claes Redestad Date: Fri, 22 Jan 2021 00:15:35 +0100 Subject: [PATCH 25/25] Logic error in exception handling in encodeWithEncoder --- src/java.base/share/classes/java/lang/String.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index cbdb297e5970b..5c175b125dcb0 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -876,7 +876,7 @@ private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val, bool if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { - if (doReplace) { + if (!doReplace) { throw new IllegalArgumentException(x); } else { throw new Error(x);