Skip to content

Commit

Permalink
ARROW-11149: [Rust] DF Support List/LargeList/FixedSizeList in create…
Browse files Browse the repository at this point in the history
…_batch_empty

Previously, `build_empty_list_array` was declared inside Parquet (`array_reader`), but I want to use this function inside DataFusion's `create_batch_empty` (which is used inside hash_aggregate to make an empty batch from the provided schema, which contains the types of the columns). I moved it to Arrow (because it's common and useful) and made `build_empty_large_list_array` (for large lists) on top of macros, with a different implementation than `build_empty_list_array`.

Closes apache#9114 from ovr/issue-11149

Authored-by: Dmitry Patsura <zaets28rus@gmail.com>
Signed-off-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
ovr authored and michalursa committed Jun 13, 2021
1 parent ae5d30c commit 5a41f4a
Show file tree
Hide file tree
Showing 5 changed files with 370 additions and 119 deletions.
349 changes: 339 additions & 10 deletions rust/arrow/src/array/array_list.rs
Expand Up @@ -19,15 +19,19 @@ use std::any::Any;
use std::convert::From;
use std::fmt;
use std::mem;
use std::sync::Arc;

use num::Num;

use super::{
array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayDataRef,
ArrayRef,
ArrayRef, BinaryBuilder, BooleanBuilder, FixedSizeListBuilder, PrimitiveBuilder,
StringBuilder,
};
use crate::array::builder::GenericListBuilder;
use crate::datatypes::ArrowNativeType;
use crate::datatypes::DataType;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};

/// trait declaring an offset size, relevant for i32 vs i64 array types.
pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord + std::ops::AddAssign {
Expand Down Expand Up @@ -238,13 +242,16 @@ impl From<ArrayDataRef> for FixedSizeListArray {
let values = make_array(data.child_data()[0].clone());
let length = match data.data_type() {
DataType::FixedSizeList(_, len) => {
// check that child data is multiple of length
assert_eq!(
values.len() % *len as usize,
0,
"FixedSizeListArray child array length should be a multiple of {}",
len
);
if *len > 0 {
// check that child data is multiple of length
assert_eq!(
values.len() % *len as usize,
0,
"FixedSizeListArray child array length should be a multiple of {}",
len
);
}

*len
}
_ => {
Expand Down Expand Up @@ -295,10 +302,268 @@ impl fmt::Debug for FixedSizeListArray {
}
}

/// Expands to an expression `Ok(Arc::new(array))`, where `array` is whatever a brand-new
/// `GenericListBuilder<$offset_type, PrimitiveBuilder<$item_type>>` returns from `finish()`
/// when nothing has been appended to it.
///
/// * `$item_type` - type argument handed to `PrimitiveBuilder` (e.g. `Int32Type`).
/// * `$offset_type` - first type argument of `GenericListBuilder`.
///
/// The `0` given to `PrimitiveBuilder::new` is presumably an initial capacity - confirm
/// against the builder's documentation.
macro_rules! build_empty_list_array_with_primitive_items {
    ($item_type:ident, $offset_type:ident) => {{
        let mut list_builder = GenericListBuilder::<
            $offset_type,
            PrimitiveBuilder<$item_type>,
        >::new(PrimitiveBuilder::<$item_type>::new(0));
        // Nothing is appended before `finish()`.
        Ok(Arc::new(list_builder.finish()))
    }};
}

/// Expands to an expression `Ok(Arc::new(array))`, where `array` is whatever a brand-new
/// `GenericListBuilder<$offset_type, $type_builder>` returns from `finish()` when nothing
/// has been appended to it.
///
/// * `$type_builder` - name of a values builder that offers `new(0)`
///   (e.g. `StringBuilder`).
/// * `$offset_type` - first type argument of `GenericListBuilder`.
macro_rules! build_empty_list_array_with_non_primitive_items {
    ($type_builder:ident, $offset_type:ident) => {{
        let mut list_builder = GenericListBuilder::<$offset_type, $type_builder>::new(
            $type_builder::new(0),
        );
        // Nothing is appended before `finish()`.
        Ok(Arc::new(list_builder.finish()))
    }};
}

pub fn build_empty_list_array<OffsetSize: OffsetSizeTrait>(
item_type: DataType,
) -> Result<ArrayRef> {
match item_type {
DataType::UInt8 => {
build_empty_list_array_with_primitive_items!(UInt8Type, OffsetSize)
}
DataType::UInt16 => {
build_empty_list_array_with_primitive_items!(UInt16Type, OffsetSize)
}
DataType::UInt32 => {
build_empty_list_array_with_primitive_items!(UInt32Type, OffsetSize)
}
DataType::UInt64 => {
build_empty_list_array_with_primitive_items!(UInt64Type, OffsetSize)
}
DataType::Int8 => {
build_empty_list_array_with_primitive_items!(Int8Type, OffsetSize)
}
DataType::Int16 => {
build_empty_list_array_with_primitive_items!(Int16Type, OffsetSize)
}
DataType::Int32 => {
build_empty_list_array_with_primitive_items!(Int32Type, OffsetSize)
}
DataType::Int64 => {
build_empty_list_array_with_primitive_items!(Int64Type, OffsetSize)
}
DataType::Float32 => {
build_empty_list_array_with_primitive_items!(Float32Type, OffsetSize)
}
DataType::Float64 => {
build_empty_list_array_with_primitive_items!(Float64Type, OffsetSize)
}
DataType::Boolean => {
build_empty_list_array_with_non_primitive_items!(BooleanBuilder, OffsetSize)
}
DataType::Date32(_) => {
build_empty_list_array_with_primitive_items!(Date32Type, OffsetSize)
}
DataType::Date64(_) => {
build_empty_list_array_with_primitive_items!(Date64Type, OffsetSize)
}
DataType::Time32(TimeUnit::Second) => {
build_empty_list_array_with_primitive_items!(Time32SecondType, OffsetSize)
}
DataType::Time32(TimeUnit::Millisecond) => {
build_empty_list_array_with_primitive_items!(
Time32MillisecondType,
OffsetSize
)
}
DataType::Time64(TimeUnit::Microsecond) => {
build_empty_list_array_with_primitive_items!(
Time64MicrosecondType,
OffsetSize
)
}
DataType::Time64(TimeUnit::Nanosecond) => {
build_empty_list_array_with_primitive_items!(Time64NanosecondType, OffsetSize)
}
DataType::Duration(TimeUnit::Second) => {
build_empty_list_array_with_primitive_items!(DurationSecondType, OffsetSize)
}
DataType::Duration(TimeUnit::Millisecond) => {
build_empty_list_array_with_primitive_items!(
DurationMillisecondType,
OffsetSize
)
}
DataType::Duration(TimeUnit::Microsecond) => {
build_empty_list_array_with_primitive_items!(
DurationMicrosecondType,
OffsetSize
)
}
DataType::Duration(TimeUnit::Nanosecond) => {
build_empty_list_array_with_primitive_items!(
DurationNanosecondType,
OffsetSize
)
}
DataType::Timestamp(TimeUnit::Second, _) => {
build_empty_list_array_with_primitive_items!(TimestampSecondType, OffsetSize)
}
DataType::Timestamp(TimeUnit::Millisecond, _) => {
build_empty_list_array_with_primitive_items!(
TimestampMillisecondType,
OffsetSize
)
}
DataType::Timestamp(TimeUnit::Microsecond, _) => {
build_empty_list_array_with_primitive_items!(
TimestampMicrosecondType,
OffsetSize
)
}
DataType::Timestamp(TimeUnit::Nanosecond, _) => {
build_empty_list_array_with_primitive_items!(
TimestampNanosecondType,
OffsetSize
)
}
DataType::Utf8 => {
build_empty_list_array_with_non_primitive_items!(StringBuilder, OffsetSize)
}
DataType::Binary => {
build_empty_list_array_with_non_primitive_items!(BinaryBuilder, OffsetSize)
}
_ => Err(ArrowError::NotYetImplemented(format!(
"GenericListBuilder of type List({:?}) is not supported",
item_type
))),
}
}

/// Expands to an expression `Ok(Arc::new(array))`, where `array` is whatever a brand-new
/// `FixedSizeListBuilder` wrapping `PrimitiveBuilder<$item_type>` returns from `finish()`
/// when nothing has been appended to it.
///
/// NOTE(review): the second argument of `FixedSizeListBuilder::new` is the literal `0`;
/// presumably this is the per-list length, so the result would not carry any other
/// length a caller might expect - confirm.
macro_rules! build_empty_fixed_size_list_array_with_primitive_items {
    ($item_type:ident) => {{
        let mut list_builder =
            FixedSizeListBuilder::new(PrimitiveBuilder::<$item_type>::new(0), 0);
        // Nothing is appended before `finish()`.
        Ok(Arc::new(list_builder.finish()))
    }};
}

/// Expands to an expression `Ok(Arc::new(array))`, where `array` is whatever a brand-new
/// `FixedSizeListBuilder` wrapping `$type_builder` returns from `finish()` when nothing
/// has been appended to it.
///
/// NOTE(review): the second argument of `FixedSizeListBuilder::new` is the literal `0`;
/// presumably this is the per-list length, so the result would not carry any other
/// length a caller might expect - confirm.
macro_rules! build_empty_fixed_size_list_array_with_non_primitive_items {
    ($type_builder:ident) => {{
        let mut list_builder = FixedSizeListBuilder::new($type_builder::new(0), 0);
        // Nothing is appended before `finish()`.
        Ok(Arc::new(list_builder.finish()))
    }};
}

pub fn build_empty_fixed_size_list_array(item_type: DataType) -> Result<ArrayRef> {
match item_type {
DataType::UInt8 => {
build_empty_fixed_size_list_array_with_primitive_items!(UInt8Type)
}
DataType::UInt16 => {
build_empty_fixed_size_list_array_with_primitive_items!(UInt16Type)
}
DataType::UInt32 => {
build_empty_fixed_size_list_array_with_primitive_items!(UInt32Type)
}
DataType::UInt64 => {
build_empty_fixed_size_list_array_with_primitive_items!(UInt64Type)
}
DataType::Int8 => {
build_empty_fixed_size_list_array_with_primitive_items!(Int8Type)
}
DataType::Int16 => {
build_empty_fixed_size_list_array_with_primitive_items!(Int16Type)
}
DataType::Int32 => {
build_empty_fixed_size_list_array_with_primitive_items!(Int32Type)
}
DataType::Int64 => {
build_empty_fixed_size_list_array_with_primitive_items!(Int64Type)
}
DataType::Float32 => {
build_empty_fixed_size_list_array_with_primitive_items!(Float32Type)
}
DataType::Float64 => {
build_empty_fixed_size_list_array_with_primitive_items!(Float64Type)
}
DataType::Boolean => {
build_empty_fixed_size_list_array_with_non_primitive_items!(BooleanBuilder)
}
DataType::Date32(_) => {
build_empty_fixed_size_list_array_with_primitive_items!(Date32Type)
}
DataType::Date64(_) => {
build_empty_fixed_size_list_array_with_primitive_items!(Date64Type)
}
DataType::Time32(TimeUnit::Second) => {
build_empty_fixed_size_list_array_with_primitive_items!(Time32SecondType)
}
DataType::Time32(TimeUnit::Millisecond) => {
build_empty_fixed_size_list_array_with_primitive_items!(Time32MillisecondType)
}
DataType::Time64(TimeUnit::Microsecond) => {
build_empty_fixed_size_list_array_with_primitive_items!(Time64MicrosecondType)
}
DataType::Time64(TimeUnit::Nanosecond) => {
build_empty_fixed_size_list_array_with_primitive_items!(Time64NanosecondType)
}
DataType::Duration(TimeUnit::Second) => {
build_empty_fixed_size_list_array_with_primitive_items!(DurationSecondType)
}
DataType::Duration(TimeUnit::Millisecond) => {
build_empty_fixed_size_list_array_with_primitive_items!(
DurationMillisecondType
)
}
DataType::Duration(TimeUnit::Microsecond) => {
build_empty_fixed_size_list_array_with_primitive_items!(
DurationMicrosecondType
)
}
DataType::Duration(TimeUnit::Nanosecond) => {
build_empty_fixed_size_list_array_with_primitive_items!(
DurationNanosecondType
)
}
DataType::Timestamp(TimeUnit::Second, _) => {
build_empty_fixed_size_list_array_with_primitive_items!(TimestampSecondType)
}
DataType::Timestamp(TimeUnit::Millisecond, _) => {
build_empty_fixed_size_list_array_with_primitive_items!(
TimestampMillisecondType
)
}
DataType::Timestamp(TimeUnit::Microsecond, _) => {
build_empty_fixed_size_list_array_with_primitive_items!(
TimestampMicrosecondType
)
}
DataType::Timestamp(TimeUnit::Nanosecond, _) => {
build_empty_fixed_size_list_array_with_primitive_items!(
TimestampNanosecondType
)
}
DataType::Utf8 => {
build_empty_fixed_size_list_array_with_non_primitive_items!(StringBuilder)
}
DataType::Binary => {
build_empty_fixed_size_list_array_with_non_primitive_items!(BinaryBuilder)
}
_ => Err(ArrowError::NotYetImplemented(format!(
"FixedSizeListBuilder of type FixedSizeList({:?}) is not supported",
item_type
))),
}
}

#[cfg(test)]
mod tests {
use crate::{
array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::Field,
array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::Field, memory,
util::bit_util,
};

Expand Down Expand Up @@ -770,4 +1035,68 @@ mod tests {
.build();
ListArray::from(list_data);
}

/// Building an `Int32Array` on top of a buffer that was sliced at offset 1 is expected
/// to panic with "memory is not aligned".
#[test]
#[should_panic(expected = "memory is not aligned")]
fn test_primitive_array_alignment() {
    let raw = memory::allocate_aligned(8);
    // SAFETY (as far as this test shows): `raw` was just returned by
    // `allocate_aligned(8)`, and 8 is passed as both of the remaining arguments.
    let whole = unsafe { Buffer::from_raw_parts(raw, 8, 8) };
    // Starting the view at offset 1 is what should trip the alignment check.
    let shifted = whole.slice(1);
    let data = ArrayData::builder(DataType::Int32)
        .add_buffer(shifted)
        .build();
    Int32Array::from(data);
}

/// Building a `ListArray` whose own buffer was sliced at offset 1 is expected to panic
/// with "memory is not aligned", even though the child values buffer is built normally.
#[test]
#[should_panic(expected = "memory is not aligned")]
fn test_list_array_alignment() {
    let raw = memory::allocate_aligned(8);
    // SAFETY (as far as this test shows): `raw` was just returned by
    // `allocate_aligned(8)`, and 8 is passed as both of the remaining arguments.
    let whole = unsafe { Buffer::from_raw_parts(raw, 8, 8) };
    // Starting the view at offset 1 is what should trip the alignment check.
    let shifted = whole.slice(1);

    // Child data: eight zeroed `i32` values.
    let zeros: [i32; 8] = [0; 8];
    let child = ArrayData::builder(DataType::Int32)
        .add_buffer(Buffer::from(zeros.to_byte_slice()))
        .build();

    let item_field = Field::new("item", DataType::Int32, false);
    let data = ArrayData::builder(DataType::List(Box::new(item_field)))
        .add_buffer(shifted)
        .add_child_data(child)
        .build();
    ListArray::from(data);
}

/// Calls `build_empty_list_array::<$OFFSET>` once for each item type in the table below
/// and panics, naming the offending item type and offset type, if any call returns `Err`.
///
/// Each item type is listed once, so a failure message identifies it unambiguously.
macro_rules! make_test_build_empty_list_array {
    ($OFFSET:ident) => {
        for item_type in vec![
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::Float32,
            DataType::Float64,
            DataType::Utf8,
            DataType::Binary,
        ] {
            // Rendered up front because `item_type` is moved into the call below.
            let label = format!("{:?}", item_type);
            build_empty_list_array::<$OFFSET>(item_type).unwrap_or_else(|e| {
                panic!(
                    "building an empty List({}) with {} offsets failed: {:?}",
                    label,
                    stringify!($OFFSET),
                    e
                )
            });
        }
    };
}

/// Runs the `make_test_build_empty_list_array!` checks once per offset type.
#[test]
fn test_build_empty_list_array() {
// The macro argument becomes the `OffsetSize` type argument of `build_empty_list_array`.
make_test_build_empty_list_array!(i32);
make_test_build_empty_list_array!(i64);
}

/// Calls `build_empty_fixed_size_list_array` once for each item type in the table below
/// and panics, naming the offending item type, if any call returns `Err`.
///
/// Each item type is listed once, so a failure message identifies it unambiguously.
#[test]
fn test_build_empty_fixed_size_list_array() {
    for item_type in vec![
        DataType::Boolean,
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
        DataType::Float32,
        DataType::Float64,
        DataType::Utf8,
        DataType::Binary,
    ] {
        // Rendered up front because `item_type` is moved into the call below.
        let label = format!("{:?}", item_type);
        build_empty_fixed_size_list_array(item_type).unwrap_or_else(|e| {
            panic!(
                "building an empty FixedSizeList({}) failed: {:?}",
                label, e
            )
        });
    }
}
}
2 changes: 2 additions & 0 deletions rust/arrow/src/array/mod.rs
Expand Up @@ -119,6 +119,8 @@ pub use self::array_binary::FixedSizeBinaryArray;
pub use self::array_binary::LargeBinaryArray;
pub use self::array_boolean::BooleanArray;
pub use self::array_dictionary::DictionaryArray;
pub use self::array_list::build_empty_fixed_size_list_array;
pub use self::array_list::build_empty_list_array;
pub use self::array_list::FixedSizeListArray;
pub use self::array_list::LargeListArray;
pub use self::array_list::ListArray;
Expand Down

0 comments on commit 5a41f4a

Please sign in to comment.