Skip to content

Commit

Permalink
CSV parser do not trim quoted strings
Browse files Browse the repository at this point in the history
Trimming strings can help correct simple general mistakes in CSV data files and header,
but strings that are quoted are typically put there explicitly and should not be
altered by the parser.

The trimming logic has been moved into BufferedCharSeeker Which means it can make
better decisions about trimming when quotes are involved.
Basically trimming now means that whitespace around delimiters are removed
and quotes will be removed if (after potential whitespace have been trimmed)
value starts and ends with quotes. This results in a more predictable trimming,
especially in combination with quoting.

Examples of w/ or w/o trim strings:

- w/o:
  `"a", " b " , c , "d "," e "` -> `|a| " b " | c | "d "| e |`
  here the string is simply chopped up at each delimiter and if the quotes aren't
  precisely the first and last character of the value then they will be treated
  as quote characters.
- w/ BEFORE this commit:
  `"a", " b " , c , "d "," e "` -> `|a|" b "|c|"d "|e|`
  here the quoted values which begin with whitespace before quotes will only
  have those whitespaces trimmed, but keep the quote characters. It will also
  trim a quoted string after removing its quotes if the quotes were precisely
  the first and last characters of the value.
- w/ AFTER this commit:
  `"a", " b " , c , "d "," e "` -> `|a| b |c|d | e |`
  here the whitespace around delimiters are trimmed and quotes are correctly
  recognized and therefore also removed if they are the first and last characters
  after initial whitespace around delimiters have been removed.
  • Loading branch information
tinwelint committed Jan 2, 2018
1 parent e1b9c09 commit ef7e2f2
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 48 deletions.
Expand Up @@ -37,6 +37,7 @@ public class BufferedCharSeeker implements CharSeeker
private static final char EOL_CHAR_2 = '\r'; private static final char EOL_CHAR_2 = '\r';
private static final char EOF_CHAR = (char) -1; private static final char EOF_CHAR = (char) -1;
private static final char BACK_SLASH = '\\'; private static final char BACK_SLASH = '\\';
private static final char WHITESPACE = ' ';


private char[] buffer; private char[] buffer;
private int dataLength; private int dataLength;
Expand All @@ -63,6 +64,7 @@ public class BufferedCharSeeker implements CharSeeker
private final boolean legacyStyleQuoting; private final boolean legacyStyleQuoting;
private final Source source; private final Source source;
private Chunk currentChunk; private Chunk currentChunk;
private final boolean trim;


public BufferedCharSeeker( Source source, Configuration config ) public BufferedCharSeeker( Source source, Configuration config )
{ {
Expand All @@ -71,6 +73,7 @@ public BufferedCharSeeker( Source source, Configuration config )
this.lineStartPos = this.bufferPos; this.lineStartPos = this.bufferPos;
this.multilineFields = config.multilineFields(); this.multilineFields = config.multilineFields();
this.legacyStyleQuoting = config.legacyStyleQuoting(); this.legacyStyleQuoting = config.legacyStyleQuoting();
this.trim = config.trimStrings();
} }


@Override @Override
Expand All @@ -96,12 +99,20 @@ public boolean seek( Mark mark, int untilChar ) throws IOException
ch = nextChar( skippedChars ); ch = nextChar( skippedChars );
if ( quoteDepth == 0 ) if ( quoteDepth == 0 )
{ // In normal mode, i.e. not within quotes { // In normal mode, i.e. not within quotes
if ( ch == quoteChar && seekStartPos == bufferPos - 1/* -1 since we just advanced one */ ) if ( isWhitespace( ch ) && trim )
{
if ( seekStartPos == bufferPos - 1/* -1 since we just advanced one */ )
{
// We found a whitespace, which was the first of the value and we've been told to trim that off
seekStartPos++;
}
}
else if ( ch == quoteChar && seekStartPos == bufferPos - 1/* -1 since we just advanced one */ )
{ // We found a quote, which was the first of the value, skip it and switch mode { // We found a quote, which was the first of the value, skip it and switch mode
quoteDepth++; quoteDepth++;
isQuoted = true;
seekStartPos++; seekStartPos++;
quoteStartLine = lineNumber; quoteStartLine = lineNumber;
continue;
} }
else if ( isNewLine( ch ) ) else if ( isNewLine( ch ) )
{ // Encountered newline, done for now { // Encountered newline, done for now
Expand All @@ -115,28 +126,26 @@ else if ( isNewLine( ch ) )
} }
else if ( ch == untilChar ) else if ( ch == untilChar )
{ // We found a delimiter, set marker and return true { // We found a delimiter, set marker and return true
mark.set( seekStartPos, bufferPos - endOffset - skippedChars, ch, isQuoted ); return setMark( mark, endOffset, skippedChars, ch, isQuoted );
return true; }
else
{ // This is a character to include as part of the current value
if ( isQuoted )
{ // This value is quoted, i.e. started with a quote and has also seen a quote
throw new DataAfterQuoteException( this,
new String( buffer, seekStartPos, bufferPos - seekStartPos ) );
}
} }
} }
else else
{ // In quoted mode, i.e. within quotes { // In quoted mode, i.e. within quotes
isQuoted = true;
if ( ch == quoteChar ) if ( ch == quoteChar )
{ // Found a quote within a quote, peek at next char { // Found a quote within a quote, peek at next char
int nextCh = peekChar( skippedChars ); int nextCh = peekChar( skippedChars );

if ( nextCh == quoteChar ) if ( nextCh == quoteChar )
{ // Found a double quote, skip it and we're going down one more quote depth (quote-in-quote) { // Found a double quote, skip it and we're going down one more quote depth (quote-in-quote)
repositionChar( bufferPos++, ++skippedChars ); repositionChar( bufferPos++, ++skippedChars );
} }
else if ( nextCh != untilChar && !isNewLine( nextCh ) && nextCh != EOF_CHAR )
{ // Found an ending quote of sorts, although the next char isn't a delimiter, newline, or EOF
// so it looks like there's data characters after this end quote. We don't really support that.
// So circle this back to the user saying there's something wrong with the field.
throw new DataAfterQuoteException( this,
new String( buffer, seekStartPos, bufferPos - seekStartPos ) );
}
else else
{ // Found an ending quote, skip it and switch mode { // Found an ending quote, skip it and switch mode
endOffset++; endOffset++;
Expand Down Expand Up @@ -180,10 +189,31 @@ else if ( eof )
// We found the last value of the line or stream // We found the last value of the line or stream
lineNumber++; lineNumber++;
lineStartPos = bufferPos; lineStartPos = bufferPos;
mark.set( seekStartPos, bufferPos - endOffset - skippedChars, END_OF_LINE_CHARACTER, isQuoted ); return setMark( mark, endOffset, skippedChars, END_OF_LINE_CHARACTER, isQuoted );
}

private boolean setMark( Mark mark, int endOffset, int skippedChars, int ch, boolean isQuoted )
{
int pos = (trim ? rtrim( bufferPos ) : bufferPos) - endOffset - skippedChars;
mark.set( seekStartPos, pos, ch, isQuoted );
return true; return true;
} }


private int rtrim( int start )
{
int index = start;
while ( isWhitespace( buffer[index - 1 /*bufferPos has advanced*/ - 1 /*don't check the last read char (delim or EOF)*/] ) )
{
index--;
}
return index;
}

private boolean isWhitespace( int ch )
{
return ch == WHITESPACE;
}

private void repositionChar( int offset, int stepsBack ) private void repositionChar( int offset, int stepsBack )
{ {
// We reposition characters because we might have skipped some along the way, double-quotes and what not. // We reposition characters because we might have skipped some along the way, double-quotes and what not.
Expand Down
Expand Up @@ -39,10 +39,10 @@ public interface Extractor<T> extends Cloneable
* @param data characters in a buffer. * @param data characters in a buffer.
* @param offset offset into the buffer where the value starts. * @param offset offset into the buffer where the value starts.
* @param length number of characters from the offset to extract. * @param length number of characters from the offset to extract.
* @param skippedChars whether or not there were skipped characters, f.ex. quotation. * @param hadQuotes whether or not there were skipped characters, f.ex. quotation.
* @return {@code true} if a value was extracted, otherwise {@code false}. * @return {@code true} if a value was extracted, otherwise {@code false}.
*/ */
boolean extract( char[] data, int offset, int length, boolean skippedChars ); boolean extract( char[] data, int offset, int length, boolean hadQuotes );


/** /**
* @return the most recently extracted value. * @return the most recently extracted value.
Expand Down
22 changes: 8 additions & 14 deletions community/csv/src/main/java/org/neo4j/csv/reader/Extractors.java
Expand Up @@ -115,7 +115,7 @@ public Extractors( char arrayDelimiter, boolean emptyStringsAsNull, boolean trim
} }
} }


add( string = new StringExtractor( emptyStringsAsNull, trimStrings ) ); add( string = new StringExtractor( emptyStringsAsNull ) );
add( long_ = new LongExtractor() ); add( long_ = new LongExtractor() );
add( int_ = new IntExtractor() ); add( int_ = new IntExtractor() );
add( char_ = new CharExtractor() ); add( char_ = new CharExtractor() );
Expand Down Expand Up @@ -277,17 +277,17 @@ private abstract static class AbstractSingleValueExtractor<T> extends AbstractEx
} }


@Override @Override
public final boolean extract( char[] data, int offset, int length, boolean skippedChars ) public final boolean extract( char[] data, int offset, int length, boolean hadQuotes )
{ {
if ( nullValue( length, skippedChars ) ) if ( nullValue( length, hadQuotes ) )
{ {
clear(); clear();
return false; return false;
} }
return extract0( data, offset, length ); return extract0( data, offset, length );
} }


protected boolean nullValue( int length, boolean skippedChars ) protected boolean nullValue( int length, boolean hadQuotes )
{ {
return length == 0; return length == 0;
} }
Expand All @@ -301,13 +301,11 @@ public static class StringExtractor extends AbstractSingleValueExtractor<String>
{ {
private String value; private String value;
private final boolean emptyStringsAsNull; private final boolean emptyStringsAsNull;
private final boolean trimStrings;


public StringExtractor( boolean emptyStringsAsNull, boolean trimStrings ) public StringExtractor( boolean emptyStringsAsNull )
{ {
super( String.class.getSimpleName() ); super( String.class.getSimpleName() );
this.emptyStringsAsNull = emptyStringsAsNull; this.emptyStringsAsNull = emptyStringsAsNull;
this.trimStrings = trimStrings;
} }


@Override @Override
Expand All @@ -317,19 +315,15 @@ protected void clear()
} }


@Override @Override
protected boolean nullValue( int length, boolean skippedChars ) protected boolean nullValue( int length, boolean hadQuotes )
{ {
return length == 0 && (!skippedChars || emptyStringsAsNull); return length == 0 && (!hadQuotes || emptyStringsAsNull);
} }


@Override @Override
protected boolean extract0( char[] data, int offset, int length ) protected boolean extract0( char[] data, int offset, int length )
{ {
value = new String( data, offset, length ); value = new String( data, offset, length );
if (trimStrings)
{
value = value.trim();
}
return true; return true;
} }


Expand Down Expand Up @@ -675,7 +669,7 @@ public T value()
} }


@Override @Override
public boolean extract( char[] data, int offset, int length, boolean skippedChars ) public boolean extract( char[] data, int offset, int length, boolean hadQuotes )
{ {
extract0( data, offset, length ); extract0( data, offset, length );
return true; return true;
Expand Down
4 changes: 2 additions & 2 deletions community/csv/src/main/java/org/neo4j/csv/reader/Mark.java
Expand Up @@ -36,8 +36,8 @@ public class Mark
private boolean quoted; private boolean quoted;


/** /**
* @param startPosition * @param startPosition position of first character in value (inclusive).
* @param position * @param position position of last character in value (exclusive).
* @param character use {@code -1} to denote that the matching character was an end-of-line or end-of-file * @param character use {@code -1} to denote that the matching character was an end-of-line or end-of-file
* @param quoted whether or not the original data was quoted. * @param quoted whether or not the original data was quoted.
*/ */
Expand Down
Expand Up @@ -643,6 +643,26 @@ private void shouldParseMultilineFieldWhereEndQuoteIsOnItsOwnLine( String newlin
"Quux" ) ); "Quux" ) );
} }


@Test
public void shouldTrimWhitespace() throws Exception
{
// given
String data = lines( "\n",
"Foo, Bar, Twobar , \"Baz\" , \" Quux \",\"Wiii \" , Waaaa " );

// when
seeker = seeker( data, withTrimStrings( config(), true ) );

// then
assertNextValue( seeker, mark, COMMA, "Foo" );
assertNextValue( seeker, mark, COMMA, "Bar" );
assertNextValue( seeker, mark, COMMA, "Twobar" );
assertNextValue( seeker, mark, COMMA, "Baz" );
assertNextValue( seeker, mark, COMMA, " Quux " );
assertNextValue( seeker, mark, COMMA, "Wiii " );
assertNextValue( seeker, mark, COMMA, "Waaaa" );
}

private String lines( String newline, String... lines ) private String lines( String newline, String... lines )
{ {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
Expand Down Expand Up @@ -832,6 +852,18 @@ public char quotationCharacter()
}; };
} }


private static Configuration withTrimStrings( Configuration config, boolean trimStrings )
{
return new Configuration.Overridden( config )
{
@Override
public boolean trimStrings()
{
return trimStrings;
}
};
}

private static final int TAB = '\t'; private static final int TAB = '\t';
private static final int COMMA = ','; private static final int COMMA = ',';
private static final Random random = new Random(); private static final Random random = new Random();
Expand Down
Expand Up @@ -224,35 +224,35 @@ public void shouldExtractNullForEmptyQuotedStringIfConfiguredTo() throws Excepti
} }


@Test @Test
public void shouldTrimStringIfConfiguredTo() throws Exception public void shouldTrimStringArrayIfConfiguredTo() throws Exception
{ {
// GIVEN // GIVEN
Extractors extractors = new Extractors( ',', true, true); Extractors extractors = new Extractors( ';', true, true );
String value = " abcde fgh "; String value = "ab;cd ; ef; gh ";


// WHEN // WHEN
char[] asChars = value.toCharArray(); char[] asChars = value.toCharArray();
Extractor<String> extractor = extractors.string(); Extractor<String[]> extractor = extractors.stringArray();
extractor.extract( asChars, 0, asChars.length, true ); extractor.extract( asChars, 0, asChars.length, true );


// THEN // THEN
assertEquals( value.trim(), extractor.value() ); assertArrayEquals( new String[] {"ab", "cd", "ef", "gh"}, extractor.value() );
} }


@Test @Test
public void shouldNotTrimStringIfNotConfiguredTo() throws Exception public void shouldNotTrimStringIfNotConfiguredTo() throws Exception
{ {
// GIVEN // GIVEN
Extractors extractors = new Extractors( ',', true, false); Extractors extractors = new Extractors( ';', true, false );
String value = " abcde fgh "; String value = "ab;cd ; ef; gh ";


// WHEN // WHEN
char[] asChars = value.toCharArray(); char[] asChars = value.toCharArray();
Extractor<String> extractor = extractors.string(); Extractor<String[]> extractor = extractors.stringArray();
extractor.extract( asChars, 0, asChars.length, true ); extractor.extract( asChars, 0, asChars.length, true );


// THEN // THEN
assertEquals( value, extractor.value() ); assertArrayEquals( new String[] {"ab", "cd ", " ef", " gh "}, extractor.value() );
} }


@Test @Test
Expand Down
Expand Up @@ -31,6 +31,7 @@
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
Expand Down Expand Up @@ -1193,7 +1194,10 @@ public void shouldTrimStringsIfConfiguredTo() throws Exception
{ {
// GIVEN // GIVEN
String name = " This is a line with leading and trailing whitespaces "; String name = " This is a line with leading and trailing whitespaces ";
File data = data( ":ID,name", "1,\"" + name + "\""); File data = data(
":ID,name",
"1,\"" + name + "\"",
"2," + name );


// WHEN // WHEN
importTool( importTool(
Expand All @@ -1203,13 +1207,18 @@ public void shouldTrimStringsIfConfiguredTo() throws Exception


// THEN // THEN
GraphDatabaseService db = dbRule.getGraphDatabaseAPI(); GraphDatabaseService db = dbRule.getGraphDatabaseAPI();
try ( Transaction tx = db.beginTx() ) try ( Transaction tx = db.beginTx();
ResourceIterator<Node> allNodes = db.getAllNodes().iterator() )
{ {
ResourceIterator<Node> allNodes = db.getAllNodes().iterator(); Set<String> names = new HashSet<>();
Node node = Iterators.single( allNodes ); while ( allNodes.hasNext() )
allNodes.close(); {
names.add( allNodes.next().getProperty( "name" ).toString() );
}


assertEquals( name.trim(), node.getProperty( "name" ) ); assertTrue( names.remove( name ) );
assertTrue( names.remove( name.trim() ) );
assertTrue( names.isEmpty() );


tx.success(); tx.success();
} }
Expand Down
Expand Up @@ -47,7 +47,7 @@ public class InputEntityDeserializer<ENTITY extends InputEntity> extends InputIt
private final Decorator<ENTITY> decorator; private final Decorator<ENTITY> decorator;
private final Deserialization<ENTITY> deserialization; private final Deserialization<ENTITY> deserialization;
private final Validator<ENTITY> validator; private final Validator<ENTITY> validator;
private final Extractors.StringExtractor stringExtractor = new Extractors.StringExtractor( false, false ); private final Extractors.StringExtractor stringExtractor = new Extractors.StringExtractor( false );
private final Collector badCollector; private final Collector badCollector;


InputEntityDeserializer( Header header, CharSeeker data, int delimiter, InputEntityDeserializer( Header header, CharSeeker data, int delimiter,
Expand Down

0 comments on commit ef7e2f2

Please sign in to comment.