diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc
index 960a69c59db5d..41c315165b97d 100644
--- a/cpp/src/arrow/csv/parser_test.cc
+++ b/cpp/src/arrow/csv/parser_test.cc
@@ -175,6 +175,13 @@ void AssertParsePartial(BlockParser& parser, const std::string& str,
   ASSERT_EQ(parsed_size, expected_size);
 }
 
+void AssertParsePartial(BlockParser& parser, const std::vector<std::string_view>& data,
+                        uint32_t expected_size) {
+  uint32_t parsed_size = static_cast<uint32_t>(-1);
+  ASSERT_OK(parser.Parse(data, &parsed_size));
+  ASSERT_EQ(parsed_size, expected_size);
+}
+
 void AssertLastRowEq(const BlockParser& parser,
                      const std::vector<std::string>& expected) {
   std::vector<std::string> values;
@@ -376,6 +383,19 @@ TEST(BlockParser, TruncatedData) {
   }
 }
 
+TEST(BlockParser, TruncatedDataViews) {
+  // If non-last block is truncated, parsing stops
+  BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/3);
+  AssertParsePartial(parser, Views({"a,b,", "c\n"}), 0);
+  // (XXX should we guarantee this one below?)
+  AssertParsePartial(parser, Views({"a,b,c\nd,", "e,f\n"}), 6);
+
+  // More sophisticated: non-last block ends on some newline inside a quoted string
+  // (terse reproducer of gh-39857)
+  AssertParsePartial(parser, Views({"a,b,\"c\n", "\"\n"}), 0);
+  AssertParsePartial(parser, Views({"a,b,c\n\"d\n", "\",e,f\n"}), 6);
+}
+
 TEST(BlockParser, Final) {
   // Tests for ParseFinal()
   BlockParser parser(ParseOptions::Defaults());
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
index 332fad054fea3..cf009f2227fce 100644
--- a/cpp/src/arrow/csv/reader.cc
+++ b/cpp/src/arrow/csv/reader.cc
@@ -263,8 +263,14 @@ class SerialBlockReader : public BlockReader {
     DCHECK_GE(nbytes, 0);
     auto offset = nbytes - bytes_before_buffer;
     if (offset < 0) {
-      // Should not happen
-      return Status::Invalid("CSV parser got out of sync with chunker");
+      // This can happen if `newlines_in_values` is not enabled and
+      // `partial + completion` ends with a newline inside a quoted string.
+      // In this case, the BlockParser stops at the truncated data in the first
+      // block (see gh-39857).
+      return Status::Invalid(
+          "CSV parser got out of sync with chunker. This can mean the data file "
+          "contains cell values spanning multiple lines; please consider enabling "
+          "the option 'newlines_in_values'.");
     }
     partial_ = SliceBuffer(buffer_, offset);
     buffer_ = next_buffer;