Skip to content

Commit

Permalink
ARROW-11365: [Rust] [Parquet] Logical type printer and parser
Browse files Browse the repository at this point in the history
This implements the parser and printer for logical types, allowing us to read and generate the schema in the form `REQUIRED INT32 field_name (INTEGER(16,false))`.

Closes apache#9705 from nevi-me/ARROW-11365

Authored-by: Neville Dipale <nevilledips@gmail.com>
Signed-off-by: Neville Dipale <nevilledips@gmail.com>
  • Loading branch information
nevi-me authored and michalursa committed Jun 13, 2021
1 parent 818208f commit 77b1212
Show file tree
Hide file tree
Showing 4 changed files with 903 additions and 128 deletions.
51 changes: 40 additions & 11 deletions rust/parquet/src/arrow/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -942,11 +942,14 @@ mod tests {
REQUIRED BOOLEAN boolean;
REQUIRED INT32 int8 (INT_8);
REQUIRED INT32 int16 (INT_16);
REQUIRED INT32 uint8 (INTEGER(8,false));
REQUIRED INT32 uint16 (INTEGER(16,false));
REQUIRED INT32 int32;
REQUIRED INT64 int64 ;
OPTIONAL DOUBLE double;
OPTIONAL FLOAT float;
OPTIONAL BINARY string (UTF8);
OPTIONAL BINARY string_2 (STRING);
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand All @@ -959,11 +962,14 @@ mod tests {
Field::new("boolean", DataType::Boolean, false),
Field::new("int8", DataType::Int8, false),
Field::new("int16", DataType::Int16, false),
Field::new("uint8", DataType::UInt8, false),
Field::new("uint16", DataType::UInt16, false),
Field::new("int32", DataType::Int32, false),
Field::new("int64", DataType::Int64, false),
Field::new("double", DataType::Float64, true),
Field::new("float", DataType::Float32, true),
Field::new("string", DataType::Utf8, true),
Field::new("string_2", DataType::Utf8, true),
];

assert_eq!(&arrow_fields, converted_arrow_schema.fields());
Expand Down Expand Up @@ -1508,18 +1514,22 @@ mod tests {
message test_schema {
REQUIRED BOOLEAN boolean;
REQUIRED INT32 int8 (INT_8);
REQUIRED INT32 uint8 (INTEGER(8,false));
REQUIRED INT32 int16 (INT_16);
REQUIRED INT32 uint16 (INTEGER(16,false));
REQUIRED INT32 int32;
REQUIRED INT64 int64 ;
REQUIRED INT64 int64;
OPTIONAL DOUBLE double;
OPTIONAL FLOAT float;
OPTIONAL BINARY string (UTF8);
REPEATED BOOLEAN bools;
OPTIONAL INT32 date (DATE);
OPTIONAL INT32 time_milli (TIME_MILLIS);
OPTIONAL INT64 time_micro (TIME_MICROS);
OPTIONAL INT64 time_nano (TIME(NANOS,false));
OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);
REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);
REQUIRED INT64 ts_nano (TIMESTAMP(NANOS,true));
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand All @@ -1534,7 +1544,9 @@ mod tests {
let arrow_fields = vec![
Field::new("boolean", DataType::Boolean, false),
Field::new("int8", DataType::Int8, false),
Field::new("uint8", DataType::UInt8, false),
Field::new("int16", DataType::Int16, false),
Field::new("uint16", DataType::UInt16, false),
Field::new("int32", DataType::Int32, false),
Field::new("int64", DataType::Int64, false),
Field::new("double", DataType::Float64, true),
Expand All @@ -1548,6 +1560,7 @@ mod tests {
Field::new("date", DataType::Date32, true),
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
Field::new("time_nano", DataType::Time64(TimeUnit::Nanosecond), true),
Field::new(
"ts_milli",
DataType::Timestamp(TimeUnit::Millisecond, None),
Expand All @@ -1558,24 +1571,28 @@ mod tests {
DataType::Timestamp(TimeUnit::Microsecond, None),
false,
),
Field::new(
"ts_nano",
DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_string())),
false,
),
];

assert_eq!(arrow_fields, converted_arrow_fields);
}

#[test]
#[ignore = "To be addressed as part of ARROW-11365"]
fn test_field_to_column_desc() {
let message_type = "
message arrow_schema {
REQUIRED BOOLEAN boolean;
REQUIRED INT32 int8 (INT_8);
REQUIRED INT32 int16 (INT_16);
REQUIRED INT32 int16 (INTEGER(16,true));
REQUIRED INT32 int32;
REQUIRED INT64 int64;
OPTIONAL DOUBLE double;
OPTIONAL FLOAT float;
OPTIONAL BINARY string (UTF8);
OPTIONAL BINARY string (STRING);
OPTIONAL GROUP bools (LIST) {
REPEATED GROUP list {
OPTIONAL BOOLEAN element;
Expand All @@ -1587,20 +1604,20 @@ mod tests {
}
}
OPTIONAL INT32 date (DATE);
OPTIONAL INT32 time_milli (TIME_MILLIS);
OPTIONAL INT32 time_milli (TIME(MILLIS,false));
OPTIONAL INT64 time_micro (TIME_MICROS);
OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);
REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);
REQUIRED INT64 ts_micro (TIMESTAMP(MICROS,false));
REQUIRED GROUP struct {
REQUIRED BOOLEAN bools;
REQUIRED INT32 uint32 (UINT_32);
REQUIRED INT32 uint32 (INTEGER(32,false));
REQUIRED GROUP int32 (LIST) {
REPEATED GROUP list {
OPTIONAL INT32 element;
}
}
}
REQUIRED BINARY dictionary_strings (UTF8);
REQUIRED BINARY dictionary_strings (STRING);
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand Down Expand Up @@ -1674,8 +1691,20 @@ mod tests {
.iter()
.zip(converted_arrow_schema.columns())
.for_each(|(a, b)| {
// TODO: ARROW-11365: If parsing v1 format, there should be no logical type
assert_eq!(a, b);
// Only check logical type if it's set on the Parquet side.
// This is because the Arrow conversion always sets logical type,
// even if there wasn't originally one.
// This is not an issue, but is an inconvenience for this test.
match a.logical_type() {
Some(_) => {
assert_eq!(a, b)
}
None => {
assert_eq!(a.name(), b.name());
assert_eq!(a.physical_type(), b.physical_type());
assert_eq!(a.converted_type(), b.converted_type());
}
};
});
}

Expand All @@ -1694,7 +1723,7 @@ mod tests {
fn test_metadata() {
let message_type = "
message test_schema {
OPTIONAL BINARY string (UTF8);
OPTIONAL BINARY string (STRING);
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand Down
73 changes: 8 additions & 65 deletions rust/parquet/src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -848,84 +848,27 @@ impl str::FromStr for LogicalType {

fn from_str(s: &str) -> result::Result<Self, Self::Err> {
match s {
"INTEGER(8,true)" => Ok(LogicalType::INTEGER(IntType {
// The type is a placeholder that gets updated elsewhere
"INTEGER" => Ok(LogicalType::INTEGER(IntType {
bit_width: 8,
is_signed: true,
})),
"INTEGER(16,true)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 16,
is_signed: true,
})),
"INTEGER(32,true)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 32,
is_signed: true,
})),
"INTEGER(64,true)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 64,
is_signed: true,
})),
"INTEGER(8,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 8,
is_signed: false,
})),
"INTEGER(16,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 16,
is_signed: false,
})),
"INTEGER(32,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 32,
is_signed: false,
})),
"INTEGER(64,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 64,
is_signed: false,
})),
"MAP" => Ok(LogicalType::MAP(MapType {})),
"LIST" => Ok(LogicalType::LIST(ListType {})),
"ENUM" => Ok(LogicalType::ENUM(EnumType {})),
// TODO: ARROW-11365
// "DECIMAL" => Ok(LogicalType::DECIMAL),
"DATE" => Ok(LogicalType::DATE(DateType {})),
"TIME(MILLIS,true)" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
})),
"TIME(MILLIS,false)" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
"DECIMAL" => Ok(LogicalType::DECIMAL(DecimalType {
precision: -1,
scale: -1,
})),
"TIME(MICROS,true)" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIME(MICROS,false)" => Ok(LogicalType::TIME(TimeType {
"DATE" => Ok(LogicalType::DATE(DateType {})),
"TIME" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(MILLIS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
})),
"TIMESTAMP(MILLIS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType {
"TIMESTAMP" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
})),
"TIMESTAMP(MICROS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(MICROS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(NANOS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(NANOS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"STRING" => Ok(LogicalType::STRING(StringType {})),
"JSON" => Ok(LogicalType::JSON(JsonType {})),
"BSON" => Ok(LogicalType::BSON(BsonType {})),
Expand Down
Loading

0 comments on commit 77b1212

Please sign in to comment.