Skip to content

Commit

Permalink
Fixes a CSV trim issue with fields containing only whitespace
Browse files Browse the repository at this point in the history
This could result in StringIndexOutOfBounds with negative values
for some fields containing nothing but whitespace characters.
  • Loading branch information
tinwelint committed Mar 27, 2018
1 parent 4cb57de commit 8c7de5a
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 14 deletions.
Expand Up @@ -194,15 +194,21 @@ else if ( eof )


private boolean setMark( Mark mark, int endOffset, int skippedChars, int ch, boolean isQuoted ) private boolean setMark( Mark mark, int endOffset, int skippedChars, int ch, boolean isQuoted )
{ {
int pos = (trim ? rtrim( bufferPos ) : bufferPos) - endOffset - skippedChars; int pos = (trim ? rtrim() : bufferPos) - endOffset - skippedChars;
mark.set( seekStartPos, pos, ch, isQuoted ); mark.set( seekStartPos, pos, ch, isQuoted );
return true; return true;
} }


private int rtrim( int start ) /**
* Starting from the current position, {@link #bufferPos}, scan backwards as long as whitespace is found.
* Although it cannot scan further back than the start of this field is, i.e. {@link #seekStartPos}.
*
* @return the right index of the value to pass into {@link Mark}. This is only called if {@link Configuration#trimStrings()} is {@code true}.
*/
private int rtrim()
{ {
int index = start; int index = bufferPos;
while ( isWhitespace( buffer[index - 1 /*bufferPos has advanced*/ - 1 /*don't check the last read char (delim or EOF)*/] ) ) while ( index - 1 > seekStartPos && isWhitespace( buffer[index - 1 /*bufferPos has advanced*/ - 1 /*don't check the last read char (delim or EOF)*/] ) )
{ {
index--; index--;
} }
Expand Down
Expand Up @@ -19,6 +19,7 @@
*/ */
package org.neo4j.csv.reader; package org.neo4j.csv.reader;


import org.apache.commons.lang3.StringUtils;
import org.junit.After; import org.junit.After;
import org.junit.Test; import org.junit.Test;
import org.junit.runner.RunWith; import org.junit.runner.RunWith;
Expand All @@ -30,6 +31,7 @@
import java.io.StringReader; import java.io.StringReader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;


Expand All @@ -42,6 +44,7 @@
import static org.junit.Assert.fail; import static org.junit.Assert.fail;
import static org.neo4j.csv.reader.CharSeekers.charSeeker; import static org.neo4j.csv.reader.CharSeekers.charSeeker;
import static org.neo4j.csv.reader.Readables.wrap; import static org.neo4j.csv.reader.Readables.wrap;
import static org.neo4j.helpers.collection.Iterators.array;


@RunWith( Parameterized.class ) @RunWith( Parameterized.class )
public class BufferedCharSeekerTest public class BufferedCharSeekerTest
Expand Down Expand Up @@ -272,10 +275,7 @@ public void shouldSkipEmptyLastValue() throws Exception
assertNextValue( seeker, mark, COMMA, "dos" ); assertNextValue( seeker, mark, COMMA, "dos" );
assertNextValue( seeker, mark, COMMA, "tres" ); assertNextValue( seeker, mark, COMMA, "tres" );
assertNextValueNotExtracted( seeker, mark, COMMA ); assertNextValueNotExtracted( seeker, mark, COMMA );
assertTrue( mark.isEndOfLine() ); assertEnd( seeker, mark, COMMA );

// THEN
assertFalse( seeker.seek( mark, COMMA ) );
} }


@Test @Test
Expand Down Expand Up @@ -436,8 +436,7 @@ public void shouldRecognizeStrayQuoteCharacters() throws Exception
assertNextValue( seeker, mark, COMMA, "four" ); assertNextValue( seeker, mark, COMMA, "four" );
assertNextValue( seeker, mark, COMMA, "five" ); assertNextValue( seeker, mark, COMMA, "five" );
assertNextValue( seeker, mark, COMMA, "s\"ix" ); assertNextValue( seeker, mark, COMMA, "s\"ix" );
assertTrue( mark.isEndOfLine() ); assertEnd( seeker, mark, COMMA );
assertFalse( seeker.seek( mark, COMMA ) );
} }


@Test @Test
Expand All @@ -457,8 +456,7 @@ public void shouldNotMisinterpretUnfilledRead() throws Exception
assertNextValue( seeker, mark, COMMA, "abc" ); assertNextValue( seeker, mark, COMMA, "abc" );
assertNextValue( seeker, mark, COMMA, "def" ); assertNextValue( seeker, mark, COMMA, "def" );
assertNextValue( seeker, mark, COMMA, "ghi" ); assertNextValue( seeker, mark, COMMA, "ghi" );
assertTrue( mark.isEndOfLine() ); assertEnd( seeker, mark, COMMA );
assertFalse( seeker.seek( mark, COMMA ) );
} }


@Test @Test
Expand Down Expand Up @@ -526,8 +524,7 @@ public void shouldListenToMusic() throws Exception
assertNextValue( seeker, mark, COMMA, "4" ); assertNextValue( seeker, mark, COMMA, "4" );
assertNextValue( seeker, mark, COMMA, "The Cardigans" ); assertNextValue( seeker, mark, COMMA, "The Cardigans" );
assertNextValue( seeker, mark, COMMA, "1992" ); assertNextValue( seeker, mark, COMMA, "1992" );
assertTrue( mark.isEndOfLine() ); assertEnd( seeker, mark, COMMA );
assertFalse( seeker.seek( mark, COMMA ) );
} }


@Test @Test
Expand Down Expand Up @@ -663,6 +660,141 @@ public void shouldTrimWhitespace() throws Exception
assertNextValue( seeker, mark, COMMA, "Waaaa" ); assertNextValue( seeker, mark, COMMA, "Waaaa" );
} }


@Test
public void shouldTrimStringsWithFirstLineCharacterSpace() throws IOException
{
// given
String line = " ,a, ,b, ";
seeker = seeker( line, withTrimStrings( config(), true ) );

// when/then
assertNextValueNotExtracted( seeker, mark, COMMA );
assertNextValue( seeker, mark, COMMA, "a" );
assertNextValueNotExtracted( seeker, mark, COMMA );
assertNextValue( seeker, mark, COMMA, "b" );
assertNextValueNotExtracted( seeker, mark, COMMA );
assertEnd( seeker, mark, COMMA );
}

@Test
public void shouldParseAndTrimRandomStrings() throws IOException
{
// given
StringBuilder builder = new StringBuilder();
int columns = random.nextInt( 10 ) + 5;
int lines = 100;
List<String> expected = new ArrayList<>();
for ( int i = 0; i < lines; i++ )
{
for ( int j = 0; j < columns; j++ )
{
if ( j > 0 )
{
if ( random.nextBoolean() )
{
// Space before delimiter
builder.append( " " );
}
builder.append( "," );
if ( random.nextBoolean() )
{
// Space before delimiter
builder.append( " " );
}
}
boolean quote = random.nextBoolean();
if ( random.nextBoolean() )
{
String value = "";
if ( quote )
{
// Quote
if ( random.nextBoolean() )
{
// Space after quote start
value += " ";
}
}
// Actual value
value += String.valueOf( random.nextInt() );
if ( quote )
{
if ( random.nextBoolean() )
{
// Space before quote end
value += " ";
}
}
expected.add( value );
builder.append( quote ? "\"" + value + "\"" : value );
}
else
{
expected.add( null );
}
}
builder.append( format( "%n" ) );
}
String data = builder.toString();
seeker = seeker( data, withTrimStrings( config(), true ) );

// when
Iterator<String> next = expected.iterator();
for ( int i = 0; i < lines; i++ )
{
for ( int j = 0; j < columns; j++ )
{
// then
String nextExpected = next.next();
if ( nextExpected == null )
{
assertNextValueNotExtracted( seeker, mark, COMMA );
}
else
{
assertNextValue( seeker, mark, COMMA, nextExpected );
}
}
}
assertEnd( seeker, mark, COMMA );
}

@Test
public void shouldParseNonLatinCharacters() throws IOException
{
// given
List<String[]> expected = asList(
array( "普通�?/普通話", "\uD83D\uDE21" ),
array( "\uD83D\uDE21\uD83D\uDCA9\uD83D\uDC7B", "ⲹ楡�?톜ഷۢ⼈�?�늉�?�₭샺ጚ砧攡跿家䯶�?⬖�?�犽ۼ" ),
array( " 㺂�?鋦毠", ";먵�?裬岰鷲趫\uA8C5얱㓙髿ᚳᬼ≩�?� " )
);
String data = lines( format( "%n" ), expected );

// when
seeker = seeker( data );

// then
for ( String[] line : expected )
{
for ( String cell : line )
{
assertNextValue( seeker, mark, COMMA, cell );
}
}
assertEnd( seeker, mark, COMMA );
}

private String lines( String newline, List<String[]> cells )
{
String[] lines = new String[cells.size()];
int i = 0;
for ( String[] columns : cells )
{
lines[i++] = StringUtils.join( columns, "," );
}
return lines( newline, lines );
}

private String lines( String newline, String... lines ) private String lines( String newline, String... lines )
{ {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
Expand Down Expand Up @@ -753,6 +885,12 @@ private void assertNextValueNotExtracted( CharSeeker seeker, Mark mark, int deli
assertFalse( seeker.tryExtract( mark, extractors.string() ) ); assertFalse( seeker.tryExtract( mark, extractors.string() ) );
} }


private void assertEnd( CharSeeker seeker, Mark mark, int delimiter ) throws IOException
{
assertTrue( mark.isEndOfLine() );
assertFalse( seeker.seek( mark, delimiter ) );
}

private String[] nextLineOfAllStrings( CharSeeker seeker, Mark mark ) throws IOException private String[] nextLineOfAllStrings( CharSeeker seeker, Mark mark ) throws IOException
{ {
List<String> line = new ArrayList<>(); List<String> line = new ArrayList<>();
Expand Down

0 comments on commit 8c7de5a

Please sign in to comment.