-
Notifications
You must be signed in to change notification settings - Fork 2.3k
/
BufferedCharSeeker.java
410 lines (372 loc) · 14.1 KB
/
BufferedCharSeeker.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
/*
* Copyright (c) 2002-2018 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.csv.reader;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import org.neo4j.csv.reader.Source.Chunk;
import org.neo4j.values.AnyValue;
import static java.lang.String.format;
import static org.neo4j.csv.reader.Mark.END_OF_LINE_CHARACTER;
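/**
 * A {@link CharSeeker} which reads characters from a {@link Source}, one {@link Chunk} at a time, and seeks
 * delimiter-separated values in them. Buffers its input much like a {@link BufferedReader} does for a
 * {@link Reader}.
 */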
public class BufferedCharSeeker implements CharSeeker
{
// Characters treated as line breaks (see isNewLine/isEolChar)
private static final char EOL_CHAR = '\n';
private static final char EOL_CHAR_2 = '\r';
// Sentinel returned by nextChar() when there are no more characters to read
private static final char EOF_CHAR = (char) -1;
// Escape character, only honoured inside quotes when legacyStyleQuoting is enabled
private static final char BACK_SLASH = '\\';
// The only character considered whitespace when trimming values
private static final char WHITESPACE = ' ';
// character data of the current chunk, replaced on every successful fillBuffer()
private char[] buffer;
// number of characters in the current chunk
private int dataLength;
// max field size as reported by the current chunk, a single value cannot span more characters than this
private int dataCapacity;
// index into the buffer character array to read the next time nextChar() is called
private int bufferPos;
// value of bufferPos right after the current chunk was received, used for calculating position()
private int bufferStartPos;
// index just past the last character in use in the buffer, i.e. bufferStartPos + dataLength
private int bufferEnd;
// bufferPos denoting the start of this current line that we're reading
private int lineStartPos;
// bufferPos when we started reading the current field
private int seekStartPos;
// 1-based value of which logical line we're reading at the moment
private int lineNumber;
// flag to know if we've read to the end
private boolean eof;
// char to recognize as quote start/end
private final char quoteChar;
// this absolute position + bufferPos is the current position in the source we're reading
private long absoluteBufferStartPosition;
// description of the source of the current chunk, line numbers restart when this changes
private String sourceDescription;
// whether or not a quoted value is allowed to contain new-line characters
private final boolean multilineFields;
// whether or not back-slash escaped quotes/back-slashes are recognized inside quoted values
private final boolean legacyStyleQuoting;
// where chunks of characters are read from
private final Source source;
// most recently received chunk, null until the first chunk has been read
private Chunk currentChunk;
// whether or not leading and trailing whitespace should be trimmed off of values
private final boolean trim;
/**
 * Creates a seeker reading characters from the given {@code source}. Nothing is read from the source here,
 * the first chunk is requested when the first character is needed.
 *
 * @param source {@link Source} to read chunks of characters from.
 * @param config {@link Configuration} supplying quotation character, multi-line, legacy quoting and
 * trimming settings.
 */
public BufferedCharSeeker( Source source, Configuration config )
{
    // Parsing behaviour, all of it decided up front by the configuration
    this.quoteChar = config.quotationCharacter();
    this.multilineFields = config.multilineFields();
    this.legacyStyleQuoting = config.legacyStyleQuoting();
    this.trim = getTrimStringIgnoreErrors( config );

    // Where the characters come from, and the (so far untouched) line start
    this.source = source;
    this.lineStartPos = this.bufferPos;
}
/**
 * Seeks the next value, i.e. reads characters until {@code untilChar}, a new-line character or the end of the
 * data is found, and stores the boundaries of that value in {@code mark}.
 *
 * A value starting with the quote character is read in "quoted mode" where delimiters and (if configured)
 * new-line characters are part of the value. Escaped quotes inside such a value are removed by shifting the
 * following characters back in the buffer, {@code skippedChars} keeps track of how many.
 *
 * @param mark {@link Mark} to update with start/end position of the found value.
 * @param untilChar delimiter character which ends a value.
 * @return {@code true} if a value was found and {@code mark} was set to it, {@code false} if the end
 * was reached without finding any value.
 * @throws IOException on error reading more characters.
 * @throws DataAfterQuoteException if a quoted value has characters after its end quote.
 * @throws IllegalMultilineFieldException if a quoted value contains a new-line and multi-line fields are disabled.
 * @throws MissingEndQuoteException if the end is reached inside a quoted value.
 */
@Override
public boolean seek( Mark mark, int untilChar ) throws IOException
{
if ( eof )
{ // We're at the end
return eof( mark );
}
// Keep a start position in case we need to further fill the buffer in nextChar, a value can at maximum be the
// whole buffer, so max one fill per value is supported.
seekStartPos = bufferPos; // seekStartPos updated in nextChar if buffer flips over, that's why it's a member
int ch;
// number of trailing characters (delimiter/new-line and potentially an end quote) which aren't part of the value
int endOffset = 1;
// number of characters (escaped quotes) removed from the value so far
int skippedChars = 0;
// > 0 while inside quotes
int quoteDepth = 0;
// line where the current quote started, for error reporting
int quoteStartLine = 0;
boolean isQuoted = false;
while ( !eof )
{
ch = nextChar( skippedChars );
if ( quoteDepth == 0 )
{ // In normal mode, i.e. not within quotes
if ( isWhitespace( ch ) && trim )
{
if ( seekStartPos == bufferPos - 1/* -1 since we just advanced one */ )
{
// We found a whitespace, which was the first of the value and we've been told to trim that off
seekStartPos++;
}
// Whitespace elsewhere is left as is here, trailing whitespace is trimmed off in setMark
}
else if ( ch == quoteChar && seekStartPos == bufferPos - 1/* -1 since we just advanced one */ )
{ // We found a quote, which was the first of the value, skip it and switch mode
quoteDepth++;
isQuoted = true;
seekStartPos++;
quoteStartLine = lineNumber;
}
else if ( isNewLine( ch ) )
{ // Encountered newline, done for now
if ( bufferPos - 1 == lineStartPos )
{ // We're at the start of this read so just skip it
seekStartPos++;
lineStartPos++;
continue;
}
break;
}
else if ( ch == untilChar )
{ // We found a delimiter, set marker and return true
return setMark( mark, endOffset, skippedChars, ch, isQuoted );
}
else
{ // This is a character to include as part of the current value
if ( isQuoted )
{ // This value is quoted, i.e. started with a quote and has also seen a quote
throw new DataAfterQuoteException( this,
new String( buffer, seekStartPos, bufferPos - seekStartPos ) );
}
}
}
else
{ // In quoted mode, i.e. within quotes
if ( ch == quoteChar )
{ // Found a quote within a quote, peek at next char
int nextCh = peekChar( skippedChars );
if ( nextCh == quoteChar )
{ // Found a doubled quote, i.e. an escaped quote: skip one of the two and stay in quoted mode
repositionChar( bufferPos++, ++skippedChars );
}
else
{ // Found an ending quote, skip it and switch mode
endOffset++;
quoteDepth--;
}
}
else if ( isNewLine( ch ) )
{ // Found a new line inside a quotation...
if ( !multilineFields )
{ // ...but we are configured to disallow it
throw new IllegalMultilineFieldException( this );
}
// ... it's OK, just keep going
if ( ch == EOL_CHAR )
{
lineNumber++;
}
}
else if ( ch == BACK_SLASH && legacyStyleQuoting )
{ // Legacy concern, support java style quote encoding
int nextCh = peekChar( skippedChars );
if ( nextCh == quoteChar || nextCh == BACK_SLASH )
{ // Found a slash encoded quote
repositionChar( bufferPos++, ++skippedChars );
}
}
else if ( eof )
{
// We have an open quote but have reached the end of the file, this is a formatting error
throw new MissingEndQuoteException( this, quoteStartLine, quoteChar );
}
}
}
// We get here after a new-line or after reaching the end, -1 since bufferPos has advanced past that character
int valueLength = bufferPos - seekStartPos - 1;
if ( eof && valueLength == 0 && seekStartPos == lineStartPos )
{ // We didn't find any of the characters sought for
return eof( mark );
}
// We found the last value of the line or stream
lineNumber++;
lineStartPos = bufferPos;
return setMark( mark, endOffset, skippedChars, END_OF_LINE_CHARACTER, isQuoted );
}
/**
 * Extracts the value at {@code mark} using {@code extractor}, without any optional data.
 *
 * @param mark {@link Mark} pointing out the value to extract.
 * @param extractor {@link Extractor} to extract the value with.
 * @return the supplied {@code extractor}.
 * @throws IllegalStateException if the extractor didn't extract any value.
 */
@Override
public <EXTRACTOR extends Extractor<?>> EXTRACTOR extract( Mark mark, EXTRACTOR extractor )
{
    final AnyValue[] noOptionalData = null;
    return extract( mark, extractor, noOptionalData );
}
/**
 * Points {@code mark} at the value which was just read, starting at {@code seekStartPos}.
 *
 * @param mark {@link Mark} to set.
 * @param endOffset number of characters at the end (delimiter, end quote) which aren't part of the value.
 * @param skippedChars number of characters which have been removed from the value while reading it.
 * @param ch the character which ended the value.
 * @param isQuoted whether or not the value was quoted.
 * @return always {@code true}, as a convenience for the caller.
 */
private boolean setMark( Mark mark, int endOffset, int skippedChars, int ch, boolean isQuoted )
{
    int end = bufferPos;
    if ( trim )
    {   // Trailing whitespace should not be part of the value
        end = rtrim( end );
    }
    mark.set( seekStartPos, end - endOffset - skippedChars, ch, isQuoted );
    return true;
}
/**
 * Moves the given buffer position backwards past any trailing whitespace of the current value.
 *
 * The scan never goes further back than the start of the current value ({@code seekStartPos}). Without that
 * bound a value consisting of whitespace only would make the scan continue into the preceding delimiter
 * or value, or even before the start of the buffer, resulting in an end position before the start position.
 *
 * @param start buffer position just after the character which ended the value (delimiter, new-line or EOF).
 * @return {@code start}, moved back one step for every trailing whitespace character of the value.
 */
private int rtrim( int start )
{
    int index = start;
    // index - 1 is the character which ended the value, so index - 2 is the last candidate character.
    // Only look at it as long as it's still within the value, i.e. at or after seekStartPos.
    while ( index - 1 > seekStartPos &&
            isWhitespace( buffer[index - 1 /*bufferPos has advanced*/ - 1 /*don't check the last read char (delim or EOF)*/] ) )
    {
        index--;
    }
    return index;
}
/**
 * @param ch character to check.
 * @return {@code true} if {@code ch} is the space character, the only character treated as whitespace here.
 */
private boolean isWhitespace( int ch )
{
    return WHITESPACE == ch;
}
/**
 * Copies the character at {@code offset} to a position {@code stepsBack} steps earlier in the buffer.
 *
 * Some characters of a value are skipped while reading it, e.g. one of the quotes in a doubled quote.
 * Instead of copying the value somewhere else, each following character of that same value is moved back
 * over the gap as it is read. Values without skipped characters never pay for this.
 *
 * @param offset buffer index of the character to move.
 * @param stepsBack how many positions back to move it.
 */
private void repositionChar( int offset, int stepsBack )
{
    final char moved = buffer[offset];
    buffer[offset - stepsBack] = moved;
}
/**
 * @param ch character to check.
 * @return {@code true} if {@code ch} is a line feed or carriage return character.
 */
private boolean isNewLine( int ch )
{
    switch ( ch )
    {
    case EOL_CHAR:
    case EOL_CHAR_2:
        return true;
    default:
        return false;
    }
}
/**
 * Reads the next character like {@link #nextChar(int)}, but steps the buffer position back again afterwards
 * so that the same character will be returned by the next read. Reaching the end is the exception,
 * in that case the position is left where {@link #nextChar(int)} put it.
 *
 * @param skippedChars number of characters skipped so far in the current value.
 * @return the next character, or {@code EOF_CHAR} if there are no more characters.
 * @throws IOException on error reading more characters.
 */
private int peekChar( int skippedChars ) throws IOException
{
    final int peeked = nextChar( skippedChars );
    if ( peeked != EOF_CHAR )
    {   // Undo the advance made by nextChar
        bufferPos--;
    }
    return peeked;
}
/**
 * Sets {@code mark} to denote that there was no value to be found.
 *
 * @param mark {@link Mark} to set.
 * @return always {@code false}, as a convenience for the caller.
 */
private boolean eof( Mark mark )
{
    final int noPosition = -1;
    mark.set( noPosition, noPosition, END_OF_LINE_CHARACTER, false );
    return false;
}
/**
 * Reads the trim-strings setting from {@code config}, falling back to the default configuration
 * if the setting cannot be read.
 *
 * @param config {@link Configuration} to read the setting from.
 * @return whether or not whitespace should be trimmed off of values.
 */
private static boolean getTrimStringIgnoreErrors( Configuration config )
{
    boolean trimStrings;
    try
    {
        trimStrings = config.trimStrings();
    }
    catch ( Throwable ignored )
    {
        // Deliberately ignored: Cypher compatibility may hand us an older (Cypher 2.3) implementation of
        // Configuration, one which doesn't have trimStrings() at all. Use the default in that case.
        trimStrings = Configuration.DEFAULT.trimStrings();
    }
    return trimStrings;
}
/**
 * Extracts the value at {@code mark} using {@code extractor}, requiring that a value is extracted.
 *
 * @param mark {@link Mark} pointing out the value to extract.
 * @param extractor {@link Extractor} to extract the value with.
 * @param optionalData additional data handed to the extractor, may be {@code null}.
 * @return the supplied {@code extractor}.
 * @throws IllegalStateException if the extractor didn't extract any value.
 */
@Override
public <EXTRACTOR extends Extractor<?>> EXTRACTOR extract( Mark mark, EXTRACTOR extractor, AnyValue[] optionalData )
{
    final boolean extracted = tryExtract( mark, extractor, optionalData );
    if ( extracted )
    {
        return extractor;
    }
    throw new IllegalStateException( extractor + " didn't extract value for " + mark +
            ". For values which are optional please use tryExtract method instead" );
}
/**
 * Hands the characters between the start and end position of {@code mark} to {@code extractor}.
 *
 * @param mark {@link Mark} pointing out the value to extract.
 * @param extractor {@link Extractor} to extract the value with.
 * @param optionalData additional data handed to the extractor, may be {@code null}.
 * @return what the extractor returned from its extraction.
 */
@Override
public boolean tryExtract( Mark mark, Extractor<?> extractor, AnyValue[] optionalData )
{
    final int start = mark.startPosition();
    final int length = mark.position() - start;
    return extractor.extract( buffer, start, length, mark.isQuoted(), optionalData );
}
/**
 * Same as {@link #tryExtract(Mark, Extractor, AnyValue[])}, without any optional data.
 *
 * @param mark {@link Mark} pointing out the value to extract.
 * @param extractor {@link Extractor} to extract the value with.
 * @return what the extractor returned from its extraction.
 */
@Override
public boolean tryExtract( Mark mark, Extractor<?> extractor )
{
    final AnyValue[] noOptionalData = null;
    return tryExtract( mark, extractor, noOptionalData );
}
/**
 * Reads the character at the current buffer position, filling the buffer with more data first if needed,
 * and advances the position one step. The position is advanced also when the end has been reached.
 *
 * @param skippedChars number of characters skipped so far in the current value. If greater than zero
 * the character at the current position is also moved back that many steps in the buffer.
 * @return the character read, or {@code EOF_CHAR} if there are no more characters, in which case
 * the {@code eof} flag has been set as well.
 * @throws IOException on error reading more characters.
 */
private int nextChar( int skippedChars ) throws IOException
{
    final boolean hasMoreData = bufferPos < bufferEnd || fillBuffer();
    final int read;
    if ( hasMoreData )
    {
        read = buffer[bufferPos];
    }
    else
    {
        eof = true;
        read = EOF_CHAR;
    }

    final boolean needsReposition = skippedChars > 0;
    if ( needsReposition )
    {   // Close the gap left behind by previously skipped characters
        repositionChar( bufferPos, skippedChars );
    }
    bufferPos++;
    return read;
}
/**
 * Requests the next chunk of characters from the source and makes it the current buffer,
 * adjusting all buffer positions to the new chunk.
 *
 * @return {@code true} if something was read, otherwise {@code false} which means that we reached EOF.
 * @throws IOException on error reading the next chunk from the source.
 * @throws IllegalStateException if the value currently being read has grown to the max field size.
 */
private boolean fillBuffer() throws IOException
{
// No chunk received yet means that this is the very first fill
boolean first = currentChunk == null;
if ( !first )
{
// The value currently being read spans from seekStartPos up to bufferPos
if ( bufferPos - seekStartPos >= dataCapacity )
{
// NOTE(review): the check is against dataCapacity, but the message reports dataLength -- confirm intended
throw new IllegalStateException( "Tried to read a field larger than buffer size " +
dataLength + ". A common cause of this is that a field has an unterminated " +
"quote and so will try to seek until the next quote, which ever line it may be on." +
" This should not happen if multi-line fields are disabled, given that the fields contains " +
"no new-line characters. This field started at " + sourceDescription() + ":" + lineNumber() );
}
}
// The chunk we're leaving behind has been fully read, account for its length in the absolute position
// NOTE(review): this happens also when the source turns out to have no more chunks -- confirm intended
absoluteBufferStartPosition += dataLength;
// Fill the buffer with new characters
// presumably seekStartPos lets the source carry over the value currently being read -- verify against Source
Chunk nextChunk = source.nextChunk( first ? -1 : seekStartPos );
if ( nextChunk == Source.EMPTY_CHUNK )
{
return false;
}
// Adopt data and boundaries of the new chunk
buffer = nextChunk.data();
dataLength = nextChunk.length();
dataCapacity = nextChunk.maxFieldSize();
bufferPos = nextChunk.startPosition();
bufferStartPos = bufferPos;
bufferEnd = bufferPos + dataLength;
// The start of the current value is at the chunk's back position, move the line start the same distance
int shift = seekStartPos - nextChunk.backPosition();
seekStartPos = nextChunk.backPosition();
if ( first )
{
lineStartPos = seekStartPos;
}
else
{
lineStartPos -= shift;
}
String sourceDescriptionAfterRead = nextChunk.sourceDescription();
if ( !sourceDescriptionAfterRead.equals( sourceDescription ) )
{ // We moved over to a new source, reset line number
lineNumber = 0;
sourceDescription = sourceDescriptionAfterRead;
}
currentChunk = nextChunk;
return dataLength > 0;
}
/**
 * Closes the underlying {@link Source}.
 *
 * @throws IOException if the source fails to close.
 */
@Override
public void close() throws IOException
{
    this.source.close();
}
/**
 * @return the absolute start position of the current buffer plus the number of characters
 * that the buffer position has advanced since the buffer was filled.
 */
@Override
public long position()
{
    final int offsetInBuffer = bufferPos - bufferStartPos;
    return absoluteBufferStartPosition + offsetInBuffer;
}
/**
 * @return description of the source of the most recently read chunk, {@code null} if nothing has been read yet.
 */
@Override
public String sourceDescription()
{
    return this.sourceDescription;
}
/**
 * @return the current value of the line counter. The counter is incremented for every line read
 * and is reset when reading moves over to a new source.
 */
public long lineNumber()
{
    return this.lineNumber;
}
/**
 * @return class name together with source description, position and line number, for use in e.g. error messages.
 */
@Override
public String toString()
{
    final String name = getClass().getSimpleName();
    return format( "%s[source:%s, position:%d, line:%d]", name, sourceDescription(), position(), lineNumber() );
}
/**
 * @param c character to check.
 * @return {@code true} if {@code c} is a line feed or carriage return character.
 */
public static boolean isEolChar( char c )
{
    switch ( c )
    {
    case EOL_CHAR:
    case EOL_CHAR_2:
        return true;
    default:
        return false;
    }
}
}