Skip to content

Commit

Permalink
LOAD CSV also detects and uses BOM in file header
Browse files Browse the repository at this point in the history
Previously only Readables.files(File...) did this. This commit introduces
Readables.wrap(InputStream), which also accepts a default charset to use
if there's no BOM; otherwise the BOM controls the charset.
  • Loading branch information
tinwelint committed Aug 25, 2015
1 parent c6813ac commit 6664f21
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 16 deletions.
5 changes: 5 additions & 0 deletions community/csv/src/main/java/org/neo4j/csv/reader/Magic.java
Expand Up @@ -121,6 +121,11 @@ public static Magic of( byte[] bytes )
return NONE;
}

/**
 * Returns the value of the {@code LONGEST} constant.
 * NOTE(review): presumably the length, in bytes, of the longest known magic signature, since callers
 * use it to size the buffer they probe for a magic with - confirm against the declaration of LONGEST.
 *
 * @return the value of {@code LONGEST}.
 */
public static int longest()
{
return LONGEST;
}

private final String description;
private final Charset encoding;
private final byte[] bytes;
Expand Down
32 changes: 32 additions & 0 deletions community/csv/src/main/java/org/neo4j/csv/reader/Readables.java
Expand Up @@ -24,9 +24,11 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.List;
import java.util.zip.GZIPInputStream;
Expand Down Expand Up @@ -77,6 +79,36 @@ public String sourceDescription()
}
};

/**
 * Wraps an {@link InputStream} in a {@link CharReadable}, probing the start of the stream for a magic
 * (e.g. a BOM) that implies an encoding. If such a magic is found its bytes are consumed and its encoding
 * is used for decoding; otherwise all probed bytes are pushed back and {@code charset} is used.
 *
 * @param stream {@link InputStream} to read from. It is wrapped, not copied.
 * @param sourceName description of the data source, returned from the created reader's {@code toString()}.
 * @param charset {@link Charset} to decode with if the stream has no magic implying an encoding.
 * @return a {@link CharReadable} reading the (magic-stripped) contents of {@code stream}.
 * @throws IOException on failure to read the first bytes of {@code stream}.
 */
public static CharReadable wrap( final InputStream stream, final String sourceName, Charset charset )
        throws IOException
{
    byte[] bytes = new byte[Magic.longest()];
    PushbackInputStream pushbackStream = new PushbackInputStream( stream, bytes.length );
    Charset usedCharset = charset;

    // A single InputStream#read(byte[]) is allowed to return fewer bytes than requested even though more
    // data follows (typical for network streams). Keep reading until the probe buffer is full or the
    // stream ends, otherwise a magic split over several reads would go undetected.
    int read = 0;
    while ( read < bytes.length )
    {
        int chunk = stream.read( bytes, read, bytes.length - read );
        if ( chunk < 0 )
        {
            break;
        }
        read += chunk;
    }

    if ( read > 0 )
    {
        bytes = read < bytes.length ? Arrays.copyOf( bytes, read ) : bytes;
        Magic magic = Magic.of( bytes );
        int excessiveBytes = read;
        if ( magic.impliesEncoding() )
        {
            // Unread the diff between the BOM and the longest magic we gathered bytes for
            excessiveBytes -= magic.length();
            usedCharset = magic.encoding();
        }
        pushbackStream.unread( bytes, read - excessiveBytes, excessiveBytes );
    }

    return wrap( new InputStreamReader( pushbackStream, usedCharset )
    {
        @Override
        public String toString()
        {
            return sourceName;
        }
    } );
}

/**
* Remember that the {@link Reader#toString()} must provide a description of the data source.
*/
Expand Down
62 changes: 52 additions & 10 deletions community/csv/src/test/java/org/neo4j/csv/reader/ReadablesTest.java
Expand Up @@ -23,8 +23,10 @@
import org.junit.Test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
Expand Down Expand Up @@ -161,21 +163,49 @@ public void shouldSkipBOM() throws Exception
// GIVEN
String text = "abcdefghijklmnop";

// THEN/WHEN
shouldReadableTextFromFileWithBom( Magic.BOM_UTF_32_BE, text );
shouldReadableTextFromFileWithBom( Magic.BOM_UTF_32_LE, text );
shouldReadableTextFromFileWithBom( Magic.BOM_UTF_16_BE, text );
shouldReadableTextFromFileWithBom( Magic.BOM_UTF_16_LE, text );
shouldReadableTextFromFileWithBom( Magic.BOM_UTF_8, text );
// WHEN/THEN
shouldReadTextFromFileWithBom( Magic.BOM_UTF_32_BE, text );
shouldReadTextFromFileWithBom( Magic.BOM_UTF_32_LE, text );
shouldReadTextFromFileWithBom( Magic.BOM_UTF_16_BE, text );
shouldReadTextFromFileWithBom( Magic.BOM_UTF_16_LE, text );
shouldReadTextFromFileWithBom( Magic.BOM_UTF_8, text );
}

private void shouldReadableTextFromFileWithBom( Magic bom, String text ) throws IOException
@Test
public void shouldReadTextFromWrappedInputStream() throws Exception
{
// GIVEN
File file = writeToFile( bom.bytes(), text, bom.encoding() );
String text = "abcdefghijklmnop";

// WHEN
File file = writeToFile( text, Charset.defaultCharset() );

// THEN
assertReadText( file, text );
assertReadTextAsInputStream( file, text );
}

/**
 * Runs {@link #shouldReadTextFromInputStreamWithBom(Magic, String)} once for each of the UTF-32 BE/LE,
 * UTF-16 BE/LE and UTF-8 BOM magics, using the same text every time.
 */
@Test
public void shouldSkipBomWhenWrappingInputStream() throws Exception
{
    // GIVEN
    final String expected = "abcdefghijklmnop";
    final Magic[] boms = {
            Magic.BOM_UTF_32_BE,
            Magic.BOM_UTF_32_LE,
            Magic.BOM_UTF_16_BE,
            Magic.BOM_UTF_16_LE,
            Magic.BOM_UTF_8
    };

    // WHEN/THEN
    for ( Magic bom : boms )
    {
        shouldReadTextFromInputStreamWithBom( bom, expected );
    }
}

/**
 * Creates a file through {@code writeToFile} from the BOM's bytes, the given text and the BOM's encoding,
 * then asserts via the file-based {@code assertReadText} that {@code text} is what gets read back.
 */
private void shouldReadTextFromFileWithBom( Magic bom, String text ) throws IOException
{
    File fileWithBom = writeToFile( bom.bytes(), text, bom.encoding() );
    assertReadText( fileWithBom, text );
}

/**
 * Creates a file through {@code writeToFile} from the BOM's bytes, the given text and the BOM's encoding,
 * then asserts via {@code assertReadTextAsInputStream} that {@code text} is what gets read back.
 */
private void shouldReadTextFromInputStreamWithBom( Magic bom, String text ) throws IOException
{
    File fileWithBom = writeToFile( bom.bytes(), text, bom.encoding() );
    assertReadTextAsInputStream( fileWithBom, text );
}

private void shouldComplyWithSpecifiedCharset( Charset charset ) throws Exception
Expand Down Expand Up @@ -261,7 +291,19 @@ private File compressWithGZip( String text ) throws IOException

/**
 * Creates a {@link CharReadable} for {@code file} through {@code Readables.files} with the platform
 * default charset and hands it to the {@link CharReadable}-based {@code assertReadText} overload
 * together with the expected {@code text}.
 */
private void assertReadText( File file, String text ) throws IOException
{
    // Build the readable exactly once; a second, never-used instance was created here before.
    assertReadText( Readables.files( Charset.defaultCharset(), file ), text );
}

/**
 * Opens {@code file} as a {@link FileInputStream}, wraps it with
 * {@code Readables.wrap(InputStream, String, Charset)} using the file's path as source name and the
 * platform default charset, and passes the result to the {@link CharReadable}-based
 * {@code assertReadText} overload. The stream is closed when the check is done.
 */
private void assertReadTextAsInputStream( File file, String text ) throws IOException
{
    try ( InputStream in = new FileInputStream( file ) )
    {
        CharReadable wrapped = Readables.wrap( in, file.getPath(), Charset.defaultCharset() );
        assertReadText( wrapped, text );
    }
}

private void assertReadText( CharReadable readable, String text ) throws IOException
{
SectionedCharBuffer readText = new SectionedCharBuffer( text.toCharArray().length );
readable.read( readText, readText.front() );
assertArrayEquals( text.toCharArray(), copyOfRange( readText.array(), readText.pivot(), readText.front() ) );
Expand Down
Expand Up @@ -26,7 +26,6 @@
import org.neo4j.cypher.internal.compiler.v2_3.codegen.QueryExecutionTracer;
import org.neo4j.cypher.internal.compiler.v2_3.planDescription.InternalPlanDescription;
import org.neo4j.function.Supplier;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.kernel.api.Statement;
import org.neo4j.kernel.impl.core.NodeManager;

Expand Down
Expand Up @@ -21,14 +21,13 @@ package org.neo4j.cypher.internal.compiler.v2_3.spi

import java.io._
import java.net.{CookieHandler, CookieManager, CookiePolicy, URL}

import org.neo4j.csv.reader._
import org.neo4j.cypher.internal.compiler.v2_3.TaskCloser
import org.neo4j.cypher.internal.compiler.v2_3.pipes.ExternalResource
import org.neo4j.cypher.internal.frontend.v2_3.LoadExternalResourceException

import scala.collection.mutable.ArrayBuffer
import scala.util.control.Breaks._
import java.nio.charset.Charset

object CSVResources {
val DEFAULT_FIELD_TERMINATOR: Char = ','
Expand All @@ -48,9 +47,7 @@ class CSVResources(cleaner: TaskCloser) extends ExternalResource {

def getCsvIterator(url: URL, fieldTerminator: Option[String] = None): Iterator[Array[String]] = {
val inputStream = openStream(url)
val reader = Readables.wrap(new InputStreamReader(inputStream, "UTF-8") {
override def toString = url.toString
})
val reader = Readables.wrap( inputStream, url.toString(), Charset.forName( "UTF-8" ) )
val delimiter: Char = fieldTerminator.map(_.charAt(0)).getOrElse(CSVResources.DEFAULT_FIELD_TERMINATOR)
val seeker = CharSeekers.charSeeker(reader, CSVResources.defaultConfig, true)
val extractor = new Extractors(delimiter).string()
Expand Down

0 comments on commit 6664f21

Please sign in to comment.