Skip to content

Commit

Permalink
apacheGH-39857: [C++] Improve error message for "chunker out of sync"…
Browse files Browse the repository at this point in the history
… condition
  • Loading branch information
pitrou committed Feb 5, 2024
1 parent 85e2a68 commit f0e4fbd
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
20 changes: 20 additions & 0 deletions cpp/src/arrow/csv/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,13 @@ void AssertParsePartial(BlockParser& parser, const std::string& str,
ASSERT_EQ(parsed_size, expected_size);
}

void AssertParsePartial(BlockParser& parser, const std::vector<std::string_view>& data,
uint32_t expected_size) {
uint32_t parsed_size = static_cast<uint32_t>(-1);
ASSERT_OK(parser.Parse(data, &parsed_size));
ASSERT_EQ(parsed_size, expected_size);
}

void AssertLastRowEq(const BlockParser& parser,
const std::vector<std::string>& expected) {
std::vector<std::string> values;
Expand Down Expand Up @@ -376,6 +383,19 @@ TEST(BlockParser, TruncatedData) {
}
}

TEST(BlockParser, TruncatedDataViews) {
// If non-last block is truncated, parsing stops
BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/3);
AssertParsePartial(parser, Views({"a,b,", "c\n"}), 0);
// (XXX should we guarantee this one below?)
AssertParsePartial(parser, Views({"a,b,c\nd,", "e,f\n"}), 6);

// More sophisticated: non-last block ends on some newline inside a quoted string
// (terse reproducer of gh-39857)
AssertParsePartial(parser, Views({"a,b,\"c\n", "\"\n"}), 0);
AssertParsePartial(parser, Views({"a,b,c\n\"d\n", "\",e,f\n"}), 6);
}

TEST(BlockParser, Final) {
// Tests for ParseFinal()
BlockParser parser(ParseOptions::Defaults());
Expand Down
10 changes: 8 additions & 2 deletions cpp/src/arrow/csv/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,14 @@ class SerialBlockReader : public BlockReader {
DCHECK_GE(nbytes, 0);
auto offset = nbytes - bytes_before_buffer;
if (offset < 0) {
// Should not happen
return Status::Invalid("CSV parser got out of sync with chunker");
// This can happen if `newlines_in_values` is not enabled and
// `partial + completion` ends with a newline inside a quoted string.
// In this case, the BlockParser stops at the truncated data in the first
// block (see gh-39857).
return Status::Invalid(
"CSV parser got out of sync with chunker. This can mean the data file "
"contains cell values spanning multiple lines; please consider enabling "
"the option 'newlines_in_values'.");
}
partial_ = SliceBuffer(buffer_, offset);
buffer_ = next_buffer;
Expand Down

0 comments on commit f0e4fbd

Please sign in to comment.