diff --git a/buffer_go18_test.go b/buffer_go18_test.go index aea08f94..5833d6c6 100644 --- a/buffer_go18_test.go +++ b/buffer_go18_test.go @@ -25,6 +25,8 @@ func TestGenericBuffer(t *testing.T) { testGenericBuffer[stringColumn](t) testGenericBuffer[indexedStringColumn](t) testGenericBuffer[uuidColumn](t) + testGenericBuffer[timeColumn](t) + testGenericBuffer[timeInMillisColumn](t) testGenericBuffer[mapColumn](t) testGenericBuffer[decimalColumn](t) testGenericBuffer[addressBook](t) @@ -108,6 +110,8 @@ func BenchmarkGenericBuffer(b *testing.B) { benchmarkGenericBuffer[stringColumn](b) benchmarkGenericBuffer[indexedStringColumn](b) benchmarkGenericBuffer[uuidColumn](b) + benchmarkGenericBuffer[timeColumn](b) + benchmarkGenericBuffer[timeInMillisColumn](b) benchmarkGenericBuffer[mapColumn](b) benchmarkGenericBuffer[decimalColumn](b) benchmarkGenericBuffer[contact](b) diff --git a/column.go b/column.go index a376c7d5..c030df44 100644 --- a/column.go +++ b/column.go @@ -381,8 +381,27 @@ func schemaElementTypeOf(s *format.SchemaElement) Type { case lt.Enum != nil: return (*enumType)(lt.Enum) case lt.Decimal != nil: - // TODO: - // return (*decimalType)(lt.Decimal) + // A parquet decimal can be one of several different physical types. + if t := s.Type; t != nil { + var typ Type + switch kind := Kind(*s.Type); kind { + case Int32: + typ = Int32Type + case Int64: + typ = Int64Type + case FixedLenByteArray: + if s.TypeLength == nil { + panic("DECIMAL using FIXED_LEN_BYTE_ARRAY must specify a length") + } + typ = FixedLenByteArrayType(int(*s.TypeLength)) + default: + panic("DECIMAL must be of type INT32, INT64, or FIXED_LEN_BYTE_ARRAY but got " + kind.String()) + } + return &decimalType{ + decimal: *lt.Decimal, + Type: typ, + } + } case lt.Date != nil: return (*dateType)(lt.Date) case lt.Time != nil: diff --git a/column_buffer_go18.go b/column_buffer_go18.go index c500392c..539392cb 100644 --- a/column_buffer_go18.go +++ b/column_buffer_go18.go @@ -5,6 +5,7 @@ package parquet import ( "math/bits" "reflect" + "time" "unsafe" "github.com/segmentio/parquet-go/deprecated" @@ -30,6 +31,8 @@ func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsF switch t { case reflect.TypeOf(deprecated.Int96{}): return writeRowsFuncOfRequired(t, schema, path) + case reflect.TypeOf(time.Time{}): + return writeRowsFuncOfTime(t, schema, path) } switch t.Kind() { @@ -393,3 +396,43 @@ func writeRowsFuncOfMap(t reflect.Type, schema *Schema, path columnPath) writeRo return nil } } + +func writeRowsFuncOfTime(_ reflect.Type, schema *Schema, path columnPath) writeRowsFunc { + t := reflect.TypeOf(int64(0)) + elemSize := uintptr(t.Size()) + writeRows := writeRowsFuncOf(t, schema, path) + + col, _ := schema.Lookup(path...) + unit := Nanosecond.TimeUnit() + lt := col.Node.Type().LogicalType() + if lt != nil && lt.Timestamp != nil { + unit = lt.Timestamp.Unit + } + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + + times := rows.TimeArray() + for i := 0; i < times.Len(); i++ { + t := times.Index(i) + var val int64 + switch { + case unit.Millis != nil: + val = t.UnixMilli() + case unit.Micros != nil: + val = t.UnixMicro() + default: + val = t.UnixNano() + } + + a := makeArray(unsafecast.PointerOfValue(reflect.ValueOf(val)), 1, elemSize) + if err := writeRows(columns, a, levels); err != nil { + return err + } + } + + return nil + } +} diff --git a/convert_test.go b/convert_test.go index a92657b8..fe69b321 100644 --- a/convert_test.go +++ b/convert_test.go @@ -3,6 +3,7 @@ package parquet_test import ( "reflect" "testing" + "time" "github.com/segmentio/parquet-go" ) @@ -309,3 +310,91 @@ func TestConvert(t *testing.T) { func newInt64(i int64) *int64 { return &i } func newString(s string) *string { return &s } + +func TestConvertTimestamp(t *testing.T) { + now := time.Unix(42, 0) + ms := now.UnixMilli() + us := now.UnixMicro() + ns := now.UnixNano() + + msType := parquet.Timestamp(parquet.Millisecond).Type() + msVal := parquet.ValueOf(ms) + if msVal.Int64() != ms { + t.Errorf("converted value mismatch:\nwant = %+v\ngot = %+v", ms, msVal.Int64()) + } + + usType := parquet.Timestamp(parquet.Microsecond).Type() + usVal := parquet.ValueOf(us) + if usVal.Int64() != us { + t.Errorf("converted value mismatch:\nwant = %+v\ngot = %+v", us, usVal.Int64()) + } + + nsType := parquet.Timestamp(parquet.Nanosecond).Type() + nsVal := parquet.ValueOf(ns) + if nsVal.Int64() != ns { + t.Errorf("converted value mismatch:\nwant = %+v\ngot = %+v", ns, nsVal.Int64()) + } + + var timestampConversionTests = [...]struct { + scenario string + fromType parquet.Type + fromValue parquet.Value + toType parquet.Type + expected int64 + }{ + { + scenario: "micros to nanos", + fromType: usType, + fromValue: usVal, + toType: nsType, + expected: ns, + }, + { + scenario: "millis to nanos", + fromType: msType, + fromValue: msVal, + toType: nsType, + expected: ns, + }, + { + scenario: "nanos to micros", + fromType: nsType, + fromValue: nsVal, + toType: usType, + expected: us, + }, + { + scenario: "nanos to nanos", + fromType: nsType, + fromValue: nsVal, + toType: nsType, + expected: ns, + }, + { + scenario: "int64 to nanos", + fromType: parquet.Int64Type, + fromValue: nsVal, + toType: nsType, + expected: ns, + }, + { + scenario: "int64 to int64", + fromType: parquet.Int64Type, + fromValue: nsVal, + toType: parquet.Int64Type, + expected: ns, + }, + } + + for _, test := range timestampConversionTests { + t.Run(test.scenario, func(t *testing.T) { + a, err := test.toType.ConvertValue(test.fromValue, test.fromType) + if err != nil { + t.Fatal(err) + } + if a.Int64() != test.expected { + t.Errorf("converted value mismatch:\nwant = %+v\ngot = %+v", test.expected, a.Int64()) + } + }) + } +} diff --git a/file.go b/file.go index 164c517e..0dcbb1af 100644 --- a/file.go +++ b/file.go @@ -174,6 +174,10 @@ func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) { // this case the page index is not cached within the file, programs are expected // to make use of independently from the parquet package. func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error) { + if len(f.metadata.RowGroups) == 0 { + return nil, nil, nil + } + columnIndexOffset := f.metadata.RowGroups[0].Columns[0].ColumnIndexOffset offsetIndexOffset := f.metadata.RowGroups[0].Columns[0].OffsetIndexOffset columnIndexLength := int64(0) diff --git a/format/parquet.go b/format/parquet.go index 08c1c986..6ecdd3e7 100644 --- a/format/parquet.go +++ b/format/parquet.go @@ -134,7 +134,8 @@ type DecimalType struct { } func (t *DecimalType) String() string { - return fmt.Sprintf("DECIMAL(%d,%d)", t.Scale, t.Precision) + // Matching parquet-cli's decimal string format: https://github.com/apache/parquet-mr/blob/d057b39d93014fe40f5067ee4a33621e65c91552/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java#L249-L265 + return fmt.Sprintf("DECIMAL(%d,%d)", t.Precision, t.Scale) } // Time units for logical types. diff --git a/internal/quick/quick.go b/internal/quick/quick.go index 2bb51f23..188cb8b7 100644 --- a/internal/quick/quick.go +++ b/internal/quick/quick.go @@ -6,6 +6,7 @@ import ( "math/rand" "reflect" "strings" + "time" ) var DefaultConfig = Config{ @@ -70,6 +71,18 @@ func (c *Config) Check(f interface{}) error { type MakeValueFunc func(reflect.Value, *rand.Rand) func MakeValueFuncOf(t reflect.Type) MakeValueFunc { + switch t { + case reflect.TypeOf(time.Time{}): + return func(v reflect.Value, r *rand.Rand) { + // TODO: This is a hack to support the matching of times in a precision + // other than nanosecond by generating times rounded to the second. A + // better solution would be to update columns types to add a compare + // function. + sec := r.Int63n(2524608000) // 2050-01-01 + v.Set(reflect.ValueOf(time.Unix(sec, 0).UTC())) + } + } + switch t.Kind() { case reflect.Bool: return func(v reflect.Value, r *rand.Rand) { diff --git a/parquet_test.go b/parquet_test.go index 88615f07..0671c7f0 100644 --- a/parquet_test.go +++ b/parquet_test.go @@ -136,6 +136,24 @@ func (row uuidColumn) generate(prng *rand.Rand) uuidColumn { return row } +type timeColumn struct { + Value time.Time +} + +func (row timeColumn) generate(prng *rand.Rand) timeColumn { + t := time.Unix(0, prng.Int63()).UTC() + return timeColumn{Value: t} +} + +type timeInMillisColumn struct { + Value time.Time `parquet:",timestamp(millisecond)"` +} + +func (row timeInMillisColumn) generate(prng *rand.Rand) timeInMillisColumn { + t := time.Unix(0, prng.Int63()).UTC() + return timeInMillisColumn{Value: t} +} + type decimalColumn struct { Value int64 `parquet:",decimal(0:3)"` } diff --git a/print_test.go b/print_test.go index eae29824..6f535f4f 100644 --- a/print_test.go +++ b/print_test.go @@ -141,14 +141,14 @@ func TestPrintSchema(t *testing.T) { { node: parquet.Group{"cost": parquet.Decimal(0, 9, parquet.Int32Type)}, print: `message Test { - required int32 cost (DECIMAL(0,9)); + required int32 cost (DECIMAL(9,0)); }`, }, { node: parquet.Group{"cost": parquet.Decimal(0, 18, parquet.Int64Type)}, print: `message Test { - required int64 cost (DECIMAL(0,18)); + required int64 cost (DECIMAL(18,0)); }`, }, diff --git a/reader_go18_test.go b/reader_go18_test.go index 64c3bc3b..191efc67 100644 --- a/reader_go18_test.go +++ b/reader_go18_test.go @@ -26,6 +26,8 @@ func TestGenericReader(t *testing.T) { testGenericReader[stringColumn](t) testGenericReader[indexedStringColumn](t) testGenericReader[uuidColumn](t) + testGenericReader[timeColumn](t) + testGenericReader[timeInMillisColumn](t) testGenericReader[mapColumn](t) testGenericReader[decimalColumn](t) testGenericReader[addressBook](t) @@ -98,6 +100,8 @@ func BenchmarkGenericReader(b *testing.B) { benchmarkGenericReader[stringColumn](b) benchmarkGenericReader[indexedStringColumn](b) benchmarkGenericReader[uuidColumn](b) + benchmarkGenericReader[timeColumn](b) + benchmarkGenericReader[timeInMillisColumn](b) benchmarkGenericReader[mapColumn](b) benchmarkGenericReader[decimalColumn](b) benchmarkGenericReader[contact](b) diff --git a/reader_test.go b/reader_test.go index ddd5f481..fdb4aca8 100644 --- a/reader_test.go +++ b/reader_test.go @@ -89,6 +89,16 @@ var readerTests = []struct { model: uuidColumn{}, }, + { + scenario: "time.Time", + model: timeColumn{}, + }, + + { + scenario: "time.Time in ms", + model: timeInMillisColumn{}, + }, + { scenario: "DECIMAL", model: decimalColumn{}, diff --git a/row.go b/row.go index 602fdb06..e1e2b5d8 100644 --- a/row.go +++ b/row.go @@ -499,13 +499,15 @@ func deconstructFuncOfLeaf(columnIndex int16, node Node) (int16, deconstructFunc if columnIndex > MaxColumnIndex { panic("row cannot be deconstructed because it has more than 127 columns") } - kind := node.Type().Kind() + typ := node.Type() + kind := typ.Kind() + lt := typ.LogicalType() valueColumnIndex := ^columnIndex return columnIndex + 1, func(row Row, levels levels, value reflect.Value) Row { v := Value{} if value.IsValid() { - v = makeValue(kind, value) + v = makeValue(kind, lt, value) } v.repetitionLevel = levels.repetitionLevel diff --git a/schema.go b/schema.go index 8828219d..8a90c9bc 100644 --- a/schema.go +++ b/schema.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" "sync" + "time" "github.com/google/uuid" "github.com/segmentio/parquet-go/compress" @@ -496,6 +497,8 @@ func nodeOf(t reflect.Type, tag []string) Node { return Leaf(Int96Type) case reflect.TypeOf(uuid.UUID{}): return UUID() + case reflect.TypeOf(time.Time{}): + return Timestamp(Nanosecond) } var n Node @@ -831,7 +834,16 @@ func makeNodeOf(t reflect.Type, name string, tag []string) Node { } setNode(Timestamp(timeUnit)) default: - throwInvalidTag(t, name, option) + switch t { + case reflect.TypeOf(time.Time{}): + timeUnit, err := parseTimestampArgs(args) + if err != nil { + throwInvalidTag(t, name, option) + } + setNode(Timestamp(timeUnit)) + default: + throwInvalidTag(t, name, option) + } } default: throwUnknownTag(t, name, option) diff --git a/sparse/array.go b/sparse/array.go index 98485899..94285bec 100644 --- a/sparse/array.go +++ b/sparse/array.go @@ -1,6 +1,9 @@ package sparse -import "unsafe" +import ( + "time" + "unsafe" +) type Array struct{ array } @@ -25,6 +28,7 @@ func (a Array) Uint32Array() Uint32Array { return Uint32Array{a.array} } func (a Array) Uint64Array() Uint64Array { return Uint64Array{a.array} } func (a Array) Uint128Array() Uint128Array { return Uint128Array{a.array} } func (a Array) StringArray() StringArray { return StringArray{a.array} } +func (a Array) TimeArray() TimeArray { return TimeArray{a.array} } type array struct { ptr unsafe.Pointer @@ -290,3 +294,19 @@ func (a StringArray) Len() int { return int(a.len) } func (a StringArray) Index(i int) string { return *(*string)(a.index(i)) } func (a StringArray) Slice(i, j int) StringArray { return StringArray{a.slice(i, j)} } func (a StringArray) UnsafeArray() Array { return Array{a.array} } + +type TimeArray struct{ array } + +func MakeTimeArray(values []time.Time) TimeArray { + const sizeOfTime = unsafe.Sizeof(time.Time{}) + return TimeArray{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), sizeOfTime)} +} + +func UnsafeTimeArray(base unsafe.Pointer, length int, offset uintptr) TimeArray { + return TimeArray{makeArray(base, uintptr(length), offset)} +} + +func (a TimeArray) Len() int { return int(a.len) } +func (a TimeArray) Index(i int) time.Time { return *(*time.Time)(a.index(i)) } +func (a TimeArray) Slice(i, j int) TimeArray { return TimeArray{a.slice(i, j)} } +func (a TimeArray) UnsafeArray() Array { return Array{a.array} } diff --git a/testdata/empty.parquet b/testdata/empty.parquet new file mode 100644 index 00000000..e36cbef7 Binary files /dev/null and b/testdata/empty.parquet differ diff --git a/type.go b/type.go index f996f11f..e997c98e 100644 --- a/type.go +++ b/type.go @@ -1722,11 +1722,57 @@ func (t *timestampType) Decode(dst encoding.Values, src []byte, enc encoding.Enc } func (t *timestampType) AssignValue(dst reflect.Value, src Value) error { - return Int64Type.AssignValue(dst, src) + switch dst.Type() { + case reflect.TypeOf(time.Time{}): + unit := Nanosecond.TimeUnit() + lt := t.LogicalType() + if lt != nil && lt.Timestamp != nil { + unit = lt.Timestamp.Unit + } + + nanos := src.Int64() + switch { + case unit.Millis != nil: + nanos = nanos * 1e6 + case unit.Micros != nil: + nanos = nanos * 1e3 + } + + val := time.Unix(0, nanos).UTC() + dst.Set(reflect.ValueOf(val)) + return nil + default: + return Int64Type.AssignValue(dst, src) + } } func (t *timestampType) ConvertValue(val Value, typ Type) (Value, error) { - return Int64Type.ConvertValue(val, typ) + var sourceTs *format.TimestampType + if typ.LogicalType() != nil { + sourceTs = typ.LogicalType().Timestamp + } + + // Ignore when source is not a timestamp (i.e., Integer) + if sourceTs == nil { + return val, nil + } + + source := timeUnitDuration(sourceTs.Unit) + target := timeUnitDuration(t.Unit) + converted := val.Int64() * source.Nanoseconds() / target.Nanoseconds() + + return ValueOf(converted), nil +} + +func timeUnitDuration(unit format.TimeUnit) time.Duration { + switch { + case unit.Millis != nil: + return time.Millisecond + case unit.Micros != nil: + return time.Microsecond + default: + return time.Nanosecond + } } // List constructs a node of LIST logical type. diff --git a/value.go b/value.go index d3ece01d..5cf00530 100644 --- a/value.go +++ b/value.go @@ -8,10 +8,12 @@ import ( "math" "reflect" "strconv" + "time" "unsafe" "github.com/google/uuid" "github.com/segmentio/parquet-go/deprecated" + "github.com/segmentio/parquet-go/format" "github.com/segmentio/parquet-go/internal/unsafecast" ) @@ -158,6 +160,9 @@ func copyValues(dst ValueWriter, src ValueReader, buf []Value) (written int64, e // // The function panics if the Go value cannot be represented in parquet. func ValueOf(v interface{}) Value { + k := Kind(-1) + t := reflect.TypeOf(v) + switch value := v.(type) { case nil: return Value{} @@ -165,11 +170,10 @@ func ValueOf(v interface{}) Value { return makeValueBytes(FixedLenByteArray, value[:]) case deprecated.Int96: return makeValueInt96(value) + case time.Time: + k = Int64 } - k := Kind(-1) - t := reflect.TypeOf(v) - switch t.Kind() { case reflect.Bool: k = Boolean @@ -197,10 +201,30 @@ func ValueOf(v interface{}) Value { panic("cannot create parquet value from go value of type " + t.String()) } - return makeValue(k, reflect.ValueOf(v)) + return makeValue(k, nil, reflect.ValueOf(v)) } -func makeValue(k Kind, v reflect.Value) Value { +func makeValue(k Kind, lt *format.LogicalType, v reflect.Value) Value { + switch v.Type() { + case reflect.TypeOf(time.Time{}): + unit := Nanosecond.TimeUnit() + if lt != nil && lt.Timestamp != nil { + unit = lt.Timestamp.Unit + } + + t := v.Interface().(time.Time) + var val int64 + switch { + case unit.Millis != nil: + val = t.UnixMilli() + case unit.Micros != nil: + val = t.UnixMicro() + default: + val = t.UnixNano() + } + return makeValueInt64(val) + } + switch k { case Boolean: return makeValueBoolean(v.Bool()) diff --git a/value_test.go b/value_test.go index db9d7dda..b5a29893 100644 --- a/value_test.go +++ b/value_test.go @@ -4,6 +4,7 @@ import ( "bytes" "math" "testing" + "time" "unsafe" "github.com/segmentio/parquet-go" @@ -68,6 +69,14 @@ func TestValueClone(t *testing.T) { scenario: "FIXED_LEN_BYTE_ARRAY", values: []interface{}{[1]byte{42}, [16]byte{0: 1}}, }, + + { + scenario: "TIME", + values: []interface{}{ + time.Date(2020, 1, 2, 3, 4, 5, 7, time.UTC), + time.Date(2021, 2, 3, 4, 5, 6, 8, time.UTC), + }, + }, } for _, test := range tests { diff --git a/writer_go18_test.go b/writer_go18_test.go index 3233369e..fb7a1aed 100644 --- a/writer_go18_test.go +++ b/writer_go18_test.go @@ -24,6 +24,8 @@ func BenchmarkGenericWriter(b *testing.B) { benchmarkGenericWriter[stringColumn](b) benchmarkGenericWriter[indexedStringColumn](b) benchmarkGenericWriter[uuidColumn](b) + benchmarkGenericWriter[timeColumn](b) + benchmarkGenericWriter[timeInMillisColumn](b) benchmarkGenericWriter[mapColumn](b) benchmarkGenericWriter[decimalColumn](b) benchmarkGenericWriter[contact](b)