CSV parser do not trim quoted strings

Trimming strings can help correct simple general mistakes in CSV data files and header, but strings that are quoted are typically put there explicitly and should not be altered by the parser. The trimming logic has been moved into BufferedCharSeeker Which means it can make better decisions about trimming when quotes are involved. Basically trimming now means that whitespace around delimiters are removed and quotes will be removed if (after potential whitespace have been trimmed) value starts and ends with quotes. This results in a more predictable trimming, especially in combination with quoting. Examples of w/ or w/o trim strings: - w/o: `"a", " b " , c , "d "," e "` -> `|a| " b " | c | "d "| e |` here the string is simply chopped up at each delimiter and if the quotes aren't precisely the first and last character of the value then they will be treated as quote characters. - w/ BEFORE this commit: `"a", " b " , c , "d "," e "` -> `|a|" b "|c|"d "|e|` here the quoted values which begin with whitespace before quotes will only have those whitespaces trimmed, but keep the quote characters. It will also trim a quoted string after removing its quotes if the quotes were precisely the first and last characters of the value. - w/ AFTER this commit: `"a", " b " , c , "d "," e "` -> `|a| b |c|d | e |` here the whitespace around delimiters are trimmed and quotes are correctly recognized and therefore also removed if they are the first and last characters after initial whitespace around delimiters have been removed.
neo4j · Jan 2, 2018 · ef7e2f2 · ef7e2f2
1 parent e1b9c09
commit ef7e2f2
Show file tree

Hide file tree

Showing 8 changed files with 113 additions and 48 deletions.
diff --git a/community/csv/src/main/java/org/neo4j/csv/reader/BufferedCharSeeker.java b/community/csv/src/main/java/org/neo4j/csv/reader/BufferedCharSeeker.java
@@ -37,6 +37,7 @@ public class BufferedCharSeeker implements CharSeeker
     private static final char EOL_CHAR_2 = '\r';
     private static final char EOF_CHAR = (char) -1;
     private static final char BACK_SLASH = '\\';
+    private static final char WHITESPACE = ' ';
 
     private char[] buffer;
     private int dataLength;
@@ -63,6 +64,7 @@ public class BufferedCharSeeker implements CharSeeker
     private final boolean legacyStyleQuoting;
     private final Source source;
     private Chunk currentChunk;
+    private final boolean trim;
 
     public BufferedCharSeeker( Source source, Configuration config )
     {
@@ -71,6 +73,7 @@ public BufferedCharSeeker( Source source, Configuration config )
         this.lineStartPos = this.bufferPos;
         this.multilineFields = config.multilineFields();
         this.legacyStyleQuoting = config.legacyStyleQuoting();
+        this.trim = config.trimStrings();
     }
 
     @Override
@@ -96,12 +99,20 @@ public boolean seek( Mark mark, int untilChar ) throws IOException
             ch = nextChar( skippedChars );
             if ( quoteDepth == 0 )
             {   // In normal mode, i.e. not within quotes
-                if ( ch == quoteChar && seekStartPos == bufferPos - 1/* -1 since we just advanced one */ )
+                if ( isWhitespace( ch ) && trim )
+                {
+                    if ( seekStartPos == bufferPos - 1/* -1 since we just advanced one */ )
+                    {
+                        // We found a whitespace, which was the first of the value and we've been told to trim that off
+                        seekStartPos++;
+                    }
+                }
+                else if ( ch == quoteChar && seekStartPos == bufferPos - 1/* -1 since we just advanced one */ )
                 {   // We found a quote, which was the first of the value, skip it and switch mode
                     quoteDepth++;
+                    isQuoted = true;
                     seekStartPos++;
                     quoteStartLine = lineNumber;
-                    continue;
                 }
                 else if ( isNewLine( ch ) )
                 {   // Encountered newline, done for now
@@ -115,28 +126,26 @@ else if ( isNewLine( ch ) )
                 }
                 else if ( ch == untilChar )
                 {   // We found a delimiter, set marker and return true
-                    mark.set( seekStartPos, bufferPos - endOffset - skippedChars, ch, isQuoted );
+                    return setMark( mark, endOffset, skippedChars, ch, isQuoted );
-                    return true;
+                }
+                else
+                {   // This is a character to include as part of the current value
+                    if ( isQuoted )
+                    {   // This value is quoted, i.e. started with a quote and has also seen a quote
+                        throw new DataAfterQuoteException( this,
+                                new String( buffer, seekStartPos, bufferPos - seekStartPos ) );
+                    }
                 }
             }
             else
             {   // In quoted mode, i.e. within quotes
-                isQuoted = true;
                 if ( ch == quoteChar )
                 {   // Found a quote within a quote, peek at next char
                     int nextCh = peekChar( skippedChars );
-
                     if ( nextCh == quoteChar )
                     {   // Found a double quote, skip it and we're going down one more quote depth (quote-in-quote)
                         repositionChar( bufferPos++, ++skippedChars );
                     }
-                    else if ( nextCh != untilChar && !isNewLine( nextCh ) && nextCh != EOF_CHAR )
-                    {   // Found an ending quote of sorts, although the next char isn't a delimiter, newline, or EOF
-                        // so it looks like there's data characters after this end quote. We don't really support that.
-                        // So circle this back to the user saying there's something wrong with the field.
-                        throw new DataAfterQuoteException( this,
-                                new String( buffer, seekStartPos, bufferPos - seekStartPos ) );
-                    }
                     else
                     {   // Found an ending quote, skip it and switch mode
                         endOffset++;
@@ -180,10 +189,31 @@ else if ( eof )
         // We found the last value of the line or stream
         lineNumber++;
         lineStartPos = bufferPos;
-        mark.set( seekStartPos, bufferPos - endOffset - skippedChars, END_OF_LINE_CHARACTER, isQuoted );
+        return setMark( mark, endOffset, skippedChars, END_OF_LINE_CHARACTER, isQuoted );
+    }
+
+    private boolean setMark( Mark mark, int endOffset, int skippedChars, int ch, boolean isQuoted )
+    {
+        int pos = (trim ? rtrim( bufferPos ) : bufferPos) - endOffset - skippedChars;
+        mark.set( seekStartPos, pos, ch, isQuoted );
         return true;
     }
 
+    private int rtrim( int start )
+    {
+        int index = start;
+        while ( isWhitespace( buffer[index - 1 /*bufferPos has advanced*/ - 1 /*don't check the last read char (delim or EOF)*/] ) )
+        {
+            index--;
+        }
+        return index;
+    }
+
+    private boolean isWhitespace( int ch )
+    {
+        return ch == WHITESPACE;
+    }
+
     private void repositionChar( int offset, int stepsBack )
     {
         // We reposition characters because we might have skipped some along the way, double-quotes and what not.

diff --git a/community/csv/src/main/java/org/neo4j/csv/reader/Extractor.java b/community/csv/src/main/java/org/neo4j/csv/reader/Extractor.java
@@ -39,10 +39,10 @@ public interface Extractor<T> extends Cloneable
      * @param data characters in a buffer.
      * @param offset offset into the buffer where the value starts.
      * @param length number of characters from the offset to extract.
-     * @param skippedChars whether or not there were skipped characters, f.ex. quotation.
+     * @param hadQuotes whether or not there were skipped characters, f.ex. quotation.
      * @return {@code true} if a value was extracted, otherwise {@code false}.
      */
-    boolean extract( char[] data, int offset, int length, boolean skippedChars );
+    boolean extract( char[] data, int offset, int length, boolean hadQuotes );
 
     /**
      * @return the most recently extracted value.

diff --git a/community/csv/src/main/java/org/neo4j/csv/reader/Extractors.java b/community/csv/src/main/java/org/neo4j/csv/reader/Extractors.java
@@ -115,7 +115,7 @@ public Extractors( char arrayDelimiter, boolean emptyStringsAsNull, boolean trim
                 }
             }
 
-            add( string = new StringExtractor( emptyStringsAsNull, trimStrings ) );
+            add( string = new StringExtractor( emptyStringsAsNull ) );
             add( long_ = new LongExtractor() );
             add( int_ = new IntExtractor() );
             add( char_ = new CharExtractor() );
@@ -277,17 +277,17 @@ private abstract static class AbstractSingleValueExtractor<T> extends AbstractEx
         }
 
         @Override
-        public final boolean extract( char[] data, int offset, int length, boolean skippedChars )
+        public final boolean extract( char[] data, int offset, int length, boolean hadQuotes )
         {
-            if ( nullValue( length, skippedChars ) )
+            if ( nullValue( length, hadQuotes ) )
             {
                 clear();
                 return false;
             }
             return extract0( data, offset, length );
         }
 
-        protected boolean nullValue( int length, boolean skippedChars )
+        protected boolean nullValue( int length, boolean hadQuotes )
         {
             return length == 0;
         }
@@ -301,13 +301,11 @@ public static class StringExtractor extends AbstractSingleValueExtractor<String>
     {
         private String value;
         private final boolean emptyStringsAsNull;
-        private final boolean trimStrings;
 
-        public StringExtractor( boolean emptyStringsAsNull, boolean trimStrings )
+        public StringExtractor( boolean emptyStringsAsNull )
         {
             super( String.class.getSimpleName() );
             this.emptyStringsAsNull = emptyStringsAsNull;
-            this.trimStrings = trimStrings;
         }
 
         @Override
@@ -317,19 +315,15 @@ protected void clear()
         }
 
         @Override
-        protected boolean nullValue( int length, boolean skippedChars )
+        protected boolean nullValue( int length, boolean hadQuotes )
         {
-            return length == 0 && (!skippedChars || emptyStringsAsNull);
+            return length == 0 && (!hadQuotes || emptyStringsAsNull);
         }
 
         @Override
         protected boolean extract0( char[] data, int offset, int length )
         {
             value = new String( data, offset, length );
-            if (trimStrings)
-            {
-                value = value.trim();
-            }
             return true;
         }
 
@@ -675,7 +669,7 @@ public T value()
         }
 
         @Override
-        public boolean extract( char[] data, int offset, int length, boolean skippedChars )
+        public boolean extract( char[] data, int offset, int length, boolean hadQuotes )
         {
             extract0( data, offset, length );
             return true;

diff --git a/community/csv/src/main/java/org/neo4j/csv/reader/Mark.java b/community/csv/src/main/java/org/neo4j/csv/reader/Mark.java
@@ -36,8 +36,8 @@ public class Mark
     private boolean quoted;
 
     /**
-     * @param startPosition
+     * @param startPosition position of first character in value (inclusive).
-     * @param position
+     * @param position position of last character in value (exclusive).
      * @param character use {@code -1} to denote that the matching character was an end-of-line or end-of-file
      * @param quoted whether or not the original data was quoted.
      */

diff --git a/community/csv/src/test/java/org/neo4j/csv/reader/BufferedCharSeekerTest.java b/community/csv/src/test/java/org/neo4j/csv/reader/BufferedCharSeekerTest.java
@@ -643,6 +643,26 @@ private void shouldParseMultilineFieldWhereEndQuoteIsOnItsOwnLine( String newlin
                 "Quux" ) );
     }
 
+    @Test
+    public void shouldTrimWhitespace() throws Exception
+    {
+        // given
+        String data = lines( "\n",
+                "Foo, Bar,  Twobar , \"Baz\" , \" Quux \",\"Wiii \" , Waaaa  " );
+
+        // when
+        seeker = seeker( data, withTrimStrings( config(), true ) );
+
+        // then
+        assertNextValue( seeker, mark, COMMA, "Foo" );
+        assertNextValue( seeker, mark, COMMA, "Bar" );
+        assertNextValue( seeker, mark, COMMA, "Twobar" );
+        assertNextValue( seeker, mark, COMMA, "Baz" );
+        assertNextValue( seeker, mark, COMMA, " Quux " );
+        assertNextValue( seeker, mark, COMMA, "Wiii " );
+        assertNextValue( seeker, mark, COMMA, "Waaaa" );
+    }
+
     private String lines( String newline, String... lines )
     {
         StringBuilder builder = new StringBuilder();
@@ -832,6 +852,18 @@ public char quotationCharacter()
         };
     }
 
+    private static Configuration withTrimStrings( Configuration config, boolean trimStrings )
+    {
+        return new Configuration.Overridden( config )
+        {
+            @Override
+            public boolean trimStrings()
+            {
+                return trimStrings;
+            }
+        };
+    }
+
     private static final int TAB = '\t';
     private static final int COMMA = ',';
     private static final Random random = new Random();

diff --git a/community/csv/src/test/java/org/neo4j/csv/reader/ExtractorsTest.java b/community/csv/src/test/java/org/neo4j/csv/reader/ExtractorsTest.java
@@ -224,35 +224,35 @@ public void shouldExtractNullForEmptyQuotedStringIfConfiguredTo() throws Excepti
     }
 
     @Test
-    public void shouldTrimStringIfConfiguredTo() throws Exception
+    public void shouldTrimStringArrayIfConfiguredTo() throws Exception
     {
         // GIVEN
-        Extractors extractors = new Extractors( ',', true, true);
+        Extractors extractors = new Extractors( ';', true, true );
-        String value = " abcde fgh  ";
+        String value = "ab;cd ; ef; gh ";
 
         // WHEN
         char[] asChars = value.toCharArray();
-        Extractor<String> extractor = extractors.string();
+        Extractor<String[]> extractor = extractors.stringArray();
         extractor.extract( asChars, 0, asChars.length, true );
 
         // THEN
-        assertEquals( value.trim(), extractor.value() );
+        assertArrayEquals( new String[] {"ab", "cd", "ef", "gh"}, extractor.value() );
     }
 
     @Test
     public void shouldNotTrimStringIfNotConfiguredTo() throws Exception
     {
         // GIVEN
-        Extractors extractors = new Extractors( ',', true, false);
+        Extractors extractors = new Extractors( ';', true, false );
-        String value = " abcde fgh  ";
+        String value = "ab;cd ; ef; gh ";
 
         // WHEN
         char[] asChars = value.toCharArray();
-        Extractor<String> extractor = extractors.string();
+        Extractor<String[]> extractor = extractors.stringArray();
         extractor.extract( asChars, 0, asChars.length, true );
 
         // THEN
-        assertEquals( value, extractor.value() );
+        assertArrayEquals( new String[] {"ab", "cd ", " ef", " gh "}, extractor.value() );
     }
 
     @Test

diff --git a/community/import-tool/src/test/java/org/neo4j/tooling/ImportToolTest.java b/community/import-tool/src/test/java/org/neo4j/tooling/ImportToolTest.java
@@ -31,6 +31,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -1193,7 +1194,10 @@ public void shouldTrimStringsIfConfiguredTo() throws Exception
     {
         // GIVEN
         String name = "  This is a line with leading and trailing whitespaces   ";
-        File data = data( ":ID,name", "1,\"" + name + "\"");
+        File data = data(
+                ":ID,name",
+                "1,\"" + name + "\"",
+                "2," + name );
 
         // WHEN
         importTool(
@@ -1203,13 +1207,18 @@ public void shouldTrimStringsIfConfiguredTo() throws Exception
 
         // THEN
         GraphDatabaseService db = dbRule.getGraphDatabaseAPI();
-        try ( Transaction tx = db.beginTx() )
+        try ( Transaction tx = db.beginTx();
+              ResourceIterator<Node> allNodes = db.getAllNodes().iterator() )
         {
-            ResourceIterator<Node> allNodes = db.getAllNodes().iterator();
+            Set<String> names = new HashSet<>();
-            Node node = Iterators.single( allNodes );
+            while ( allNodes.hasNext() )
-            allNodes.close();
+            {
+                names.add( allNodes.next().getProperty( "name" ).toString() );
+            }
 
-            assertEquals( name.trim(), node.getProperty( "name" ) );
+            assertTrue( names.remove( name ) );
+            assertTrue( names.remove( name.trim() ) );
+            assertTrue( names.isEmpty() );
 
             tx.success();
         }

diff --git a/...el/src/main/java/org/neo4j/unsafe/impl/batchimport/input/csv/InputEntityDeserializer.java b/...el/src/main/java/org/neo4j/unsafe/impl/batchimport/input/csv/InputEntityDeserializer.java
@@ -47,7 +47,7 @@ public class InputEntityDeserializer<ENTITY extends InputEntity> extends InputIt
     private final Decorator<ENTITY> decorator;
     private final Deserialization<ENTITY> deserialization;
     private final Validator<ENTITY> validator;
-    private final Extractors.StringExtractor stringExtractor = new Extractors.StringExtractor( false, false );
+    private final Extractors.StringExtractor stringExtractor = new Extractors.StringExtractor( false );
     private final Collector badCollector;
 
     InputEntityDeserializer( Header header, CharSeeker data, int delimiter,