Skip to content

Commit

Permalink
Improve comments
Browse files: browse the repository at this point in the history
  • Loading branch information
pitrou committed Feb 6, 2024
1 parent 7d03352 commit 9a704c6
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 3 deletions.
6 changes: 4 additions & 2 deletions cpp/src/arrow/csv/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,12 @@ TEST(BlockParser, TruncatedData) {
}

TEST(BlockParser, TruncatedDataViews) {
// If non-last block is truncated, parsing stops
// The BlockParser API mandates that, when passing a vector of views,
// only the last view may be a truncated CSV block.
// In the current implementation, receiving a truncated non-last view
// simply stops parsing after that view.
BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/3);
AssertParsePartial(parser, Views({"a,b,", "c\n"}), 0);
// (XXX should we guarantee this one below?)
AssertParsePartial(parser, Views({"a,b,c\nd,", "e,f\n"}), 6);

// More sophisticated: non-last block ends on some newline inside a quoted string
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/arrow/csv/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,10 @@ class SerialBlockReader : public BlockReader {
auto consume_bytes = [this, bytes_before_buffer,
next_buffer](int64_t nbytes) -> Status {
DCHECK_GE(nbytes, 0);
auto offset = nbytes - bytes_before_buffer;
int64_t offset = nbytes - bytes_before_buffer;
// All data before the buffer should have been consumed.
// This is checked in Parse() and BlockParsingOperator::operator().
DCHECK_GE(offset, 0);
partial_ = SliceBuffer(buffer_, offset);
buffer_ = next_buffer;
return Status::OK();
Expand Down
3 changes: 3 additions & 0 deletions python/pyarrow/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,9 @@ def test_chunker_out_of_sync(self):
rows, parse_options=ParseOptions(newlines_in_values=True),
read_options=ReadOptions(block_size=block_size)).to_pydict()
assert d == expected
# With these block sizes, a block would end on the physical newline
# inside the quoted cell value, leading to a mismatch between
# CSV chunker and parser.
for block_size in range(8, 11):
with pytest.raises(ValueError,
match="cell values spanning multiple lines"):
Expand Down

0 comments on commit 9a704c6

Please sign in to comment.