Skip to content

Commit

Permalink
Fix #158 - Fix non ascii character handling (#372)
Browse files Browse the repository at this point in the history
  • Loading branch information
jebeaudet committed Jul 10, 2023
1 parent c228b51 commit d975178
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 6 deletions.
26 changes: 20 additions & 6 deletions src/main/java/org/codehaus/mojo/exec/LineRedirectOutputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
* under the License.
*/

import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.Objects;
import java.util.function.Consumer;

/**
Expand All @@ -32,11 +35,17 @@
*/
class LineRedirectOutputStream extends OutputStream {

private StringBuilder currentLine = new StringBuilder();
private final Consumer<String> linePrinter;
private final Charset charset;
private ByteArrayOutputStream buffer = new ByteArrayOutputStream();

public LineRedirectOutputStream(Consumer<String> linePrinter) {
this.linePrinter = linePrinter;
this(linePrinter, Charset.defaultCharset());
}

public LineRedirectOutputStream(Consumer<String> linePrinter, Charset charset) {
this.linePrinter = Objects.requireNonNull(linePrinter);
this.charset = Objects.requireNonNull(charset);
}

@Override
Expand All @@ -45,18 +54,23 @@ public void write(final int b) {
printAndReset();
return;
}
currentLine.append((char) b);
buffer.write(b);
}

@Override
public void flush() {
if (currentLine.length() > 0) {
if (buffer.size() > 0) {
printAndReset();
}
}

@Override
public void close() {
flush();
}

private void printAndReset() {
linePrinter.accept(currentLine.toString());
currentLine = new StringBuilder();
linePrinter.accept(new String(buffer.toByteArray(), charset));
buffer = new ByteArrayOutputStream();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package org.codehaus.mojo.exec;

import org.junit.Assert;
import org.junit.Assume;
import org.junit.Test;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.function.Function;

public class LineRedirectOutputStreamTest {

@Test
public void givenExtendedUnicodeCharacterOutput_whenRedirectingWithUtf8Charset_thenShouldDecodeProperly()
throws IOException {
internalTestForCharset(StandardCharsets.UTF_8);
}

@Test
public void givenExtendedUnicodeCharacterOutput_whenRedirectingWithIso8859Charset_thenShouldDecodeProperly()
throws IOException {
internalTestForCharset(StandardCharsets.ISO_8859_1);
}

@Test
public void givenExtendedUnicodeCharacterOutput_whenRedirectingWithCp1252_thenShouldDecodeProperly()
throws IOException {
Assume.assumeTrue("The JVM does not contain the cp-1252 charset", Charset.availableCharsets().containsKey("windows-1252"));
internalTestForCharset(Charset.forName("windows-1252"));
}

@Test
public void givenExtendedUnicodeCharacterOutput_whenRedirectingWithDefaultCharset_thenShouldDecodeProperly()
throws IOException {
internalTestForCharset(Charset.defaultCharset());
}

@Test
public void givenExtendedUnicodeCharacterOutput_whenRedirectingWithCharsetUnspecified_thenShouldDecodeProperly()
throws IOException {
internalTestForCharset(sb -> new LineRedirectOutputStream(sb::append), Charset.defaultCharset());
}

@Test(expected = NullPointerException.class)
public void givenNullCharset_whenInstantiating_thenShouldThrow() {
new LineRedirectOutputStream(new StringBuilder()::append, null);
}

@Test(expected = NullPointerException.class)
public void givenNullStringConsumer_whenInstantiating_thenShouldThrow() {
new LineRedirectOutputStream(null, Charset.defaultCharset());
}

private void internalTestForCharset(Charset charset) throws IOException {
internalTestForCharset(sb -> new LineRedirectOutputStream(sb::append, charset), charset);
}

private void internalTestForCharset(Function<StringBuilder, LineRedirectOutputStream> lineRedirectOutputStream,
Charset charset) throws IOException {
StringBuilder sb = new StringBuilder();
String firstLine = "Hello, 你好, नमस्ते, مرحبا, γεια σας, שלום, こんにちは, 안녕하세요!";
String secondLine = "🌍 Welcome to the world! 🌟✨🎉🔥";
String expectedString = firstLine + secondLine;

try (LineRedirectOutputStream os = lineRedirectOutputStream.apply(sb)) {
os.write(String.join("\n", firstLine, secondLine).getBytes(charset));
}

// The String to bytes to String is required here because StringCoding uses the Charset.defaultCharset()
// internally so it would make the test fail when testing for different charsets.
Assert.assertEquals(new String(expectedString.getBytes(charset), charset), sb.toString());
}
}

6 comments on commit d975178

@mirabilos
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you ought to also test this with a command output that isn’t a valid string in the current charset.

@mirabilos
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because otherwise that will be my next bugreport.

@slawekjaranowski
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you ought to also test this with a command output that isn’t a valid string in the current charset.

PR with such test are welcome. This one is already merged.

@mirabilos
Copy link

@mirabilos mirabilos commented on d975178 Jul 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, maybe put \xFE into a ByteArrayInputStream and test if that arrives properly on the other side?

If, at the same time, you test that m\xC3\xA4h is read as m\u00E4h (or just kept as byte array, not stringified) and ends up as m\xC3\xA4h in the output again, you’re good.

Script output is not bound to a specific string locale, it’s raw octets. Python 3 had to learn this the hard way as well.

@slawekjaranowski
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please create PR with your proposition or at least new issue ... comment on merged commit can be forgotten 😄

@mirabilos
Copy link

@mirabilos mirabilos commented on d975178 Jul 17, 2023 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.