From ca359ba7b478f369f715cc620de610ed2cc13e6c Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Mon, 19 Feb 2024 16:37:43 +0100
Subject: [PATCH] MINOR: [C++][Parquet][Tools] Print FIXED_LEN_BYTE_ARRAY length

In `ParquetFilePrinter`, when printing the type of the column, also print its byte width if the type is FIXED_LEN_BYTE_ARRAY.

Before:
```
Column 0: float16_plain (FIXED_LEN_BYTE_ARRAY / Float16)
Column 1: float16_byte_stream_split (FIXED_LEN_BYTE_ARRAY / Float16)
Column 2: float_plain (FLOAT)
Column 3: float_byte_stream_split (FLOAT)
Column 4: double_plain (DOUBLE)
Column 5: double_byte_stream_split (DOUBLE)
Column 6: int32_plain (INT32)
Column 7: int32_byte_stream_split (INT32)
Column 8: int64_plain (INT64)
Column 9: int64_byte_stream_split (INT64)
Column 10: flba5_plain (FIXED_LEN_BYTE_ARRAY)
Column 11: flba5_byte_stream_split (FIXED_LEN_BYTE_ARRAY)
Column 12: decimal_plain (FIXED_LEN_BYTE_ARRAY / Decimal(precision=7, scale=3) / DECIMAL(7,3))
Column 13: decimal_byte_stream_split (FIXED_LEN_BYTE_ARRAY / Decimal(precision=7, scale=3) / DECIMAL(7,3))
```

After:
```
Column 0: float16_plain (FIXED_LEN_BYTE_ARRAY(2) / Float16)
Column 1: float16_byte_stream_split (FIXED_LEN_BYTE_ARRAY(2) / Float16)
Column 2: float_plain (FLOAT)
Column 3: float_byte_stream_split (FLOAT)
Column 4: double_plain (DOUBLE)
Column 5: double_byte_stream_split (DOUBLE)
Column 6: int32_plain (INT32)
Column 7: int32_byte_stream_split (INT32)
Column 8: int64_plain (INT64)
Column 9: int64_byte_stream_split (INT64)
Column 10: flba5_plain (FIXED_LEN_BYTE_ARRAY(5))
Column 11: flba5_byte_stream_split (FIXED_LEN_BYTE_ARRAY(5))
Column 12: decimal_plain (FIXED_LEN_BYTE_ARRAY(4) / Decimal(precision=7, scale=3) / DECIMAL(7,3))
Column 13: decimal_byte_stream_split (FIXED_LEN_BYTE_ARRAY(4) / Decimal(precision=7, scale=3) / DECIMAL(7,3))
```
---
 cpp/src/parquet/printer.cc |  2 +-
 cpp/src/parquet/types.cc   | 10 ++++++++++
 cpp/src/parquet/types.h    |  2 ++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index f11397ab96ed8..ce194f897e44d 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -105,7 +105,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
   for (auto i : selected_columns) {
     const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
     stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
-           << TypeToString(descr->physical_type());
+           << TypeToString(descr->physical_type(), descr->type_length());
     const auto& logical_type = descr->logical_type();
     if (!logical_type->is_none()) {
       stream << " / " << logical_type->ToString();
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index 33fed01ba324f..7b50ed48d06b0 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -177,6 +177,16 @@ std::string TypeToString(Type::type t) {
   }
 }
 
+std::string TypeToString(Type::type t, int type_length) {
+  auto s = TypeToString(t);
+  if (t == Type::FIXED_LEN_BYTE_ARRAY) {
+    s += '(';
+    s += std::to_string(type_length);
+    s += ')';
+  }
+  return s;
+}
+
 std::string ConvertedTypeToString(ConvertedType::type t) {
   switch (t) {
     case ConvertedType::NONE:
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index 76dd0efc7cb4a..38529bceae85f 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -796,6 +796,8 @@ PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
 
 PARQUET_EXPORT std::string TypeToString(Type::type t);
 
+PARQUET_EXPORT std::string TypeToString(Type::type t, int type_length);
+
 PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
                                            ::std::string_view val);
 