Improve comments

pitrou · Feb 6, 2024 · 9a704c6 · 9a704c6
1 parent 7d03352
commit 9a704c6
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 3 deletions.
diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc
@@ -384,10 +384,12 @@ TEST(BlockParser, TruncatedData) {
 }
 
 TEST(BlockParser, TruncatedDataViews) {
-  // If non-last block is truncated, parsing stops
+  // The BlockParser API mandates that, when passing a vector of views,
+  // only the last view may be a truncated CSV block.
+  // In the current implementation, receiving a truncated non-last view
+  // simply stops parsing after that view.
   BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/3);
   AssertParsePartial(parser, Views({"a,b,", "c\n"}), 0);
-  // (XXX should we guarantee this one below?)
   AssertParsePartial(parser, Views({"a,b,c\nd,", "e,f\n"}), 6);
 
   // More sophisticated: non-last block ends on some newline inside a quoted string

diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
@@ -261,7 +261,10 @@ class SerialBlockReader : public BlockReader {
     auto consume_bytes = [this, bytes_before_buffer,
                           next_buffer](int64_t nbytes) -> Status {
       DCHECK_GE(nbytes, 0);
-      auto offset = nbytes - bytes_before_buffer;
+      int64_t offset = nbytes - bytes_before_buffer;
+      // All data before the buffer should have been consumed.
+      // This is checked in Parse() and BlockParsingOperator::operator().
+      DCHECK_GE(offset, 0);
       partial_ = SliceBuffer(buffer_, offset);
       buffer_ = next_buffer;
       return Status::OK();

diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
@@ -683,6 +683,9 @@ def test_chunker_out_of_sync(self):
                 rows, parse_options=ParseOptions(newlines_in_values=True),
                 read_options=ReadOptions(block_size=block_size)).to_pydict()
             assert d == expected
+        # With these block sizes, a block would end on the physical newline
+        # inside the quoted cell value, leading to a mismatch between
+        # the CSV chunker and the parser.
         for block_size in range(8, 11):
             with pytest.raises(ValueError,
                                match="cell values spanning multiple lines"):