diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 0ce08502921f3..140917a2e6341 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -813,11 +813,17 @@ Status ParquetFileFragment::EnsureCompleteMetadata(parquet::arrow::FileReader* r Status ParquetFileFragment::SetMetadata( std::shared_ptr metadata, - std::shared_ptr manifest) { + std::shared_ptr manifest, + std::shared_ptr original_metadata) { DCHECK(row_groups_.has_value()); metadata_ = std::move(metadata); manifest_ = std::move(manifest); + original_metadata_ = original_metadata ? std::move(original_metadata) : metadata_; + // The SchemaDescriptor needs to be owned by a FileMetaData instance, + // because SchemaManifest only stores a raw pointer (GH-39562). + DCHECK_EQ(manifest_->descr, original_metadata_->schema()) + << "SchemaDescriptor should be owned by the original FileMetaData"; statistics_expressions_.resize(row_groups_->size(), compute::literal(true)); statistics_expressions_complete_.resize(manifest_->descr->num_columns(), false); @@ -846,7 +852,8 @@ Result ParquetFileFragment::SplitByRowGroup( parquet_format_.MakeFragment(source_, partition_expression(), physical_schema_, {row_group})); - RETURN_NOT_OK(fragment->SetMetadata(metadata_, manifest_)); + RETURN_NOT_OK(fragment->SetMetadata(metadata_, manifest_, + /*original_metadata=*/original_metadata_)); fragments[i++] = std::move(fragment); } @@ -1106,7 +1113,8 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning) format_->MakeFragment({path, filesystem_}, std::move(partition_expression), physical_schema_, std::move(row_groups))); - RETURN_NOT_OK(fragment->SetMetadata(metadata_subset, manifest_)); + RETURN_NOT_OK(fragment->SetMetadata(metadata_subset, manifest_, + /*original_metadata=*/metadata_)); fragments[i++] = std::move(fragment); } diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 1e81a34fb3cf0..5141f36385e3f 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -188,7 +188,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { std::optional> row_groups); Status SetMetadata(std::shared_ptr metadata, - std::shared_ptr manifest); + std::shared_ptr manifest, + std::shared_ptr original_metadata = {}); // Overridden to opportunistically set metadata since a reader must be opened anyway. Result> ReadPhysicalSchemaImpl() override { @@ -219,6 +220,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { std::vector statistics_expressions_complete_; std::shared_ptr metadata_; std::shared_ptr manifest_; + // The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest. + std::shared_ptr original_metadata_; friend class ParquetFileFormat; friend class ParquetDatasetFactory;