Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions src/java.base/share/classes/java/lang/String.java
Original file line number Diff line number Diff line change
Expand Up @@ -2016,19 +2016,26 @@ public byte[] getBytes() {
return encode(Charset.defaultCharset(), coder(), value);
}

boolean bytesCompatible(Charset charset) {
boolean bytesCompatible(Charset charset, int srcIndex, int numChars) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surprisingly here we don't do anything for the case where the string is UTF16 and the target charset is also UTF16?

Copy link

@ExE-Boss ExE-Boss Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The UTF‑16 Charsets disallow unpaired surrogates, which Java Strings allow.

So this can only return true for UTF‑16 when the platform and charset endianness match and the String doesn’t have any unpaired surrogates.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, tricky stuff -- we'll need to think more before changing this

if (isLatin1()) {
if (charset == ISO_8859_1.INSTANCE) {
return true; // ok, same encoding
} else if (charset == UTF_8.INSTANCE || charset == US_ASCII.INSTANCE) {
return !StringCoding.hasNegatives(value, 0, value.length); // ok, if ASCII-compatible
return !StringCoding.hasNegatives(value, srcIndex, numChars); // ok, if ASCII-compatible
}
}
return false;
}

void copyToSegmentRaw(MemorySegment segment, long offset) {
MemorySegment.copy(value, 0, segment, ValueLayout.JAVA_BYTE, offset, value.length);
void copyToSegmentRaw(MemorySegment segment, long offset, int srcIndex, int srcLength) {
if (!isLatin1()) {
// This method is intended to be used together with bytesCompatible, which currently only supports
// latin1 strings. In the future, bytesCompatible could be updated to handle more cases, like
// UTF-16 strings (when the platform and charset endianness match, and the String doesn’t contain
// unpaired surrogates). If that happens, copyToSegmentRaw should also be updated.
throw new IllegalStateException("This string does not support copyToSegmentRaw");
}
MemorySegment.copy(value, srcIndex, segment, ValueLayout.JAVA_BYTE, offset, srcLength);
}

/**
Expand Down
8 changes: 4 additions & 4 deletions src/java.base/share/classes/java/lang/System.java
Original file line number Diff line number Diff line change
Expand Up @@ -2315,13 +2315,13 @@ public String getLoaderNameID(ClassLoader loader) {
}

@Override
public void copyToSegmentRaw(String string, MemorySegment segment, long offset) {
string.copyToSegmentRaw(segment, offset);
public void copyToSegmentRaw(String string, MemorySegment segment, long offset, int srcIndex, int srcLength) {
string.copyToSegmentRaw(segment, offset, srcIndex, srcLength);
}

@Override
public boolean bytesCompatible(String string, Charset charset) {
return string.bytesCompatible(charset);
public boolean bytesCompatible(String string, Charset charset, int srcIndex, int numChars) {
return string.bytesCompatible(charset, srcIndex, numChars);
}
});
}
Expand Down
88 changes: 81 additions & 7 deletions src/java.base/share/classes/java/lang/foreign/MemorySegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -1296,12 +1296,7 @@ MemorySegment reinterpret(long newSize,
* over the decoding process is required.
* <p>
* Getting a string from a segment with a known byte offset and
* known byte length can be done like so:
* {@snippet lang=java :
* byte[] bytes = new byte[length];
* MemorySegment.copy(segment, JAVA_BYTE, offset, bytes, 0, length);
* return new String(bytes, charset);
* }
* known byte length can be done using {@link #getString(long, Charset, long)}.
*
* @param offset offset in bytes (relative to this segment address) at which this
* access operation will occur
Expand All @@ -1328,6 +1323,40 @@ MemorySegment reinterpret(long newSize,
*/
String getString(long offset, Charset charset);

/**
* Reads a string from this segment at the given offset, using the provided length
* and charset.
* <p>
* This method always replaces malformed-input and unmappable-character
* sequences with this charset's default replacement string. The {@link
* java.nio.charset.CharsetDecoder} class should be used when more control
* over the decoding process is required.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we say here, as you did for copy that this method ignores \0 ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added:

If the string contains any {@code '\0'} characters, they will be read as well.

I suppose it might also make sense to update those warnings in setString and allocateFrom to mention that if you want to avoid truncating null-terminated strings, getString(long, Charset, long) could be used instead of getString(long). What do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could be a good idea, thanks!

* <p>
* If the string contains any {@code '\0'} characters, they will be read as well.
* This differs from {@link #getString(long, Charset)}, which will only read up
* to the first {@code '\0'}, resulting in truncation for string data that contains
* the {@code '\0'} character.
*
* @param offset offset in bytes (relative to this segment address) at which this
* access operation will occur
* @param charset the charset used to {@linkplain Charset#newDecoder() decode} the
* string bytes
* @param length length, in bytes, of the region of memory to read and decode into
* a string
* @return a Java string constructed from the bytes read from the given starting
* address up to the given length
* @throws IllegalArgumentException if the size of the string is greater than the
* largest string supported by the platform
* @throws IndexOutOfBoundsException if {@code offset < 0}
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - length}
* @throws IllegalStateException if the {@linkplain #scope() scope} associated with
* this segment is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code isAccessibleBy(T) == false}
* @throws IllegalArgumentException if {@code length < 0}
*/
String getString(long offset, Charset charset, long length);

/**
* Writes the given string into this segment at the given offset, converting it to
* a null-terminated byte sequence using the {@linkplain StandardCharsets#UTF_8 UTF-8}
Expand Down Expand Up @@ -1366,7 +1395,8 @@ MemorySegment reinterpret(long newSize,
* If the given string contains any {@code '\0'} characters, they will be
* copied as well. This means that, depending on the method used to read
* the string, such as {@link MemorySegment#getString(long)}, the string
* will appear truncated when read again.
* will appear truncated when read again. The string can be read without
* truncation using {@link #getString(long, Charset, long)}.
*
* @param offset offset in bytes (relative to this segment address) at which this
* access operation will occur, the final address of this write
Expand Down Expand Up @@ -2606,6 +2636,50 @@ static void copy(Object srcArray, int srcIndex,
elementCount);
}

/**
* Copies the byte sequence of the given string encoded using the provided charset
* to the destination segment.
* <p>
* This method always replaces malformed-input and unmappable-character
* sequences with this charset's default replacement byte array. The {@link
* java.nio.charset.CharsetEncoder} class should be used when more control
* over the encoding process is required.
* <p>
* If the given string contains any {@code '\0'} characters, they will be
* copied as well. This means that, depending on the method used to read
* the string, such as {@link MemorySegment#getString(long)}, the string
* will appear truncated when read again. The string can be read without
* truncation using {@link #getString(long, Charset, long)}.
*
* @param src the Java string to be written into the destination segment
* @param dstEncoding the charset used to {@linkplain Charset#newEncoder() encode}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we have a dependency on the charset being standard?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not, thanks, fixed.

Although I think the existing allocateFrom(String, Charset) method does have an undocumented dependency, because it uses CharsetKind to get the terminator char length, which only supports standard Charsets. If we add a fast path for UTF-16 that may need a dependency on a standard Charset (or a standard way to get the code unit size of a charset, if it has one).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I follow -- the method you mention says this:

    * @throws IllegalArgumentException if {@code charset} is not a
     *         {@linkplain StandardCharsets standard charset}

What do you mean by "undocumented dependency"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, you're right, it is documented. It's documented differently than e.g. MemorySegment#getString, which mentions it in both the @param and @throws doc.

* the string bytes.
* @param srcIndex the starting character index of the source string
* @param dst the destination segment
* @param dstOffset the starting offset, in bytes, of the destination segment
* @param numChars the number of characters to be copied
* @throws IllegalStateException if the {@linkplain #scope() scope} associated with
* {@code dst} is not {@linkplain Scope#isAlive() alive}
* @throws WrongThreadException if this method is called from a thread {@code T},
* such that {@code dst.isAccessibleBy(T) == false}
* @throws IndexOutOfBoundsException if either {@code srcIndex}, {@code numChars}, or {@code dstOffset}
* are {@code < 0}
* @throws IndexOutOfBoundsException if {@code srcIndex > src.length() - numChars}
* @throws IllegalArgumentException if {@code dst} is {@linkplain #isReadOnly() read-only}
* @throws IndexOutOfBoundsException if {@code dstOffset > dst.byteSize() - B} where {@code B} is the size,
* in bytes, of the substring of {@code src} encoded using the given charset
* @return the number of copied bytes.
*/
@ForceInline
static long copy(String src, Charset dstEncoding, int srcIndex, MemorySegment dst, long dstOffset, int numChars) {
    // Reference arguments are null-checked in declaration order, before any
    // index validation, so the precedence of thrown exceptions is unchanged.
    Objects.requireNonNull(src);
    Objects.requireNonNull(dstEncoding);
    Objects.requireNonNull(dst);

    // The window [srcIndex, srcIndex + numChars) must lie within the source string.
    final int srcLength = src.length();
    Objects.checkFromIndexSize(srcIndex, numChars, srcLength);

    // The encoding and the actual copy are delegated to the segment implementation.
    final long copiedBytes =
            AbstractMemorySegmentImpl.copy(src, dstEncoding, srcIndex, dst, dstOffset, numChars);
    return copiedBytes;
}

/**
* Finds and returns the relative offset, in bytes, of the first mismatch between the
* source and the destination segments. More specifically, the bytes at offset
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ default MemorySegment allocateFrom(String str) {
* If the given string contains any {@code '\0'} characters, they will be
* copied as well. This means that, depending on the method used to read
* the string, such as {@link MemorySegment#getString(long)}, the string
* will appear truncated when read again.
* will appear truncated when read again. The string can be read without
* truncation using {@link MemorySegment#getString(long, Charset, long)}.
*
* @param str the Java string to be converted into a C string
* @param charset the charset used to {@linkplain Charset#newEncoder() encode} the
Expand All @@ -137,10 +138,10 @@ default MemorySegment allocateFrom(String str, Charset charset) {
int termCharSize = StringSupport.CharsetKind.of(charset).terminatorCharSize();
MemorySegment segment;
int length;
if (StringSupport.bytesCompatible(str, charset)) {
if (StringSupport.bytesCompatible(str, charset, 0, str.length())) {
length = str.length();
segment = allocateNoInit((long) length + termCharSize);
StringSupport.copyToSegmentRaw(str, segment, 0);
StringSupport.copyToSegmentRaw(str, segment, 0, 0, str.length());
} else {
byte[] bytes = str.getBytes(charset);
length = bytes.length;
Expand All @@ -153,6 +154,53 @@ default MemorySegment allocateFrom(String str, Charset charset) {
return segment;
}

/**
* Encodes a Java string using the provided charset and stores the resulting
* byte array into a memory segment.
* <p>
* This method always replaces malformed-input and unmappable-character
* sequences with this charset's default replacement byte array. The
* {@link java.nio.charset.CharsetEncoder} class should be used when more
* control over the encoding process is required.
* <p>
* If the given string contains any {@code '\0'} characters, they will be
* copied as well. This means that, depending on the method used to read
* the string, such as {@link MemorySegment#getString(long)}, the string
* will appear truncated when read again. The string can be read without
* truncation using {@link MemorySegment#getString(long, Charset, long)}.
*
* @param str the Java string to be encoded
* @param charset the charset used to {@linkplain Charset#newEncoder() encode} the
* string bytes
* @param srcIndex the starting index of the source string
* @param numChars the number of characters to be copied
* @return a new native segment containing the encoded string
* @throws IndexOutOfBoundsException if either {@code srcIndex} or {@code numChars} are {@code < 0}
* @throws IndexOutOfBoundsException if {@code srcIndex > str.length() - numChars}
*
* @implSpec The default implementation for this method copies the contents of the
* provided Java string into a new memory segment obtained by calling
* {@code this.allocate(B)}, where {@code B} is the size, in bytes, of
* the selected substring encoded using the provided charset
* (e.g. {@code str.substring(srcIndex, srcIndex + numChars).getBytes(charset).length}).
*/
@ForceInline
default MemorySegment allocateFrom(String str, Charset charset, int srcIndex, int numChars) {
Objects.requireNonNull(charset);
Objects.requireNonNull(str);
Objects.checkFromIndexSize(srcIndex, numChars, str.length());
MemorySegment segment;
if (StringSupport.bytesCompatible(str, charset, srcIndex, numChars)) {
segment = allocateNoInit(numChars);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also seems to rely on the fact that we end up here only for latin1 strings. Again, I don't think this is correct, but if it's deliberate, we should add an assertion check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I think we make a similar assumption in the existing allocateFrom(String, Charset), it does length + termCharSize and that should perhaps be (length + 1) * codeUnitSize.

StringSupport.copyToSegmentRaw(str, segment, 0, srcIndex, numChars);
} else {
byte[] bytes = str.substring(srcIndex, srcIndex + numChars).getBytes(charset);
segment = allocateNoInit(bytes.length);
MemorySegment.copy(bytes, 0, segment, ValueLayout.JAVA_BYTE, 0, bytes.length);
}
return segment;
}

/**
* {@return a new memory segment initialized with the provided byte value}
* <p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -612,10 +612,10 @@ StackWalker newStackWalkerInstance(Set<StackWalker.Option> options,
/**
* Copy the string bytes to an existing segment, avoiding intermediate copies.
*/
void copyToSegmentRaw(String string, MemorySegment segment, long offset);
void copyToSegmentRaw(String string, MemorySegment segment, long offset, int srcIndex, int srcLength);

/**
* Are the string bytes compatible with the given charset?
*/
boolean bytesCompatible(String string, Charset charset);
boolean bytesCompatible(String string, Charset charset, int srcIndex, int numChars);
}
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,13 @@ public boolean equals(Object o) {
unsafeGetOffset() == that.unsafeGetOffset();
}

@Override
public String getString(long offset, Charset charset, long length) {
    // Validation order matters for which exception surfaces first:
    // the length argument is checked before the charset null check.
    Utils.checkNonNegativeArgument(length, "length");
    Objects.requireNonNull(charset);

    // Reading and decoding the requested byte range is delegated to StringSupport.
    final String decoded = StringSupport.read(this, offset, charset, length);
    return decoded;
}

@Override
public int hashCode() {
return Objects.hash(
Expand Down Expand Up @@ -686,6 +693,16 @@ public static void copy(Object srcArray, int srcIndex,
}
}

@ForceInline
public static long copy(String src, Charset dstEncoding, int srcIndex, MemorySegment dst, long dstOffset, int numChars) {
    Objects.requireNonNull(src);
    Objects.requireNonNull(dstEncoding);
    // A cast alone would let null through, so dst is null-checked before it is
    // narrowed to the implementation type that StringSupport.copyBytes takes.
    final AbstractMemorySegmentImpl target =
            (AbstractMemorySegmentImpl) Objects.requireNonNull(dst);

    // Note the argument order expected by copyBytes: destination offset precedes
    // the source index.
    return StringSupport.copyBytes(src, target, dstEncoding, dstOffset, srcIndex, numChars);
}

// accessors

@ForceInline
Expand Down
Loading