From dd468dc8599106b9772120d8159a6a1b8bc5a2c5 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Fri, 5 Feb 2021 00:09:32 -0800 Subject: [PATCH 1/5] Add Cython bindings for arrow::ArrayBuilder classes --- bindings/python/pymongoarrow/builder.pyx | 88 ++++++++++++++++++++++++ bindings/python/setup.py | 23 +++++-- bindings/python/test/test_builder.py | 45 ++++++++++++ 3 files changed, 151 insertions(+), 5 deletions(-) create mode 100644 bindings/python/pymongoarrow/builder.pyx create mode 100644 bindings/python/test/test_builder.py diff --git a/bindings/python/pymongoarrow/builder.pyx b/bindings/python/pymongoarrow/builder.pyx new file mode 100644 index 00000000..a3f08a2d --- /dev/null +++ b/bindings/python/pymongoarrow/builder.pyx @@ -0,0 +1,88 @@ +# Copyright 2021-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Cython compiler directives +# distutils: language=c++ +# cython: language_level=3 +from pyarrow.lib cimport * + +import numpy as np + + +cdef class Int32Builder(_Weakrefable): + cdef: + unique_ptr[CInt32Builder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CInt32Builder(pool)) + + def append(self, value): + if value is None or value is np.nan: + self.builder.get().AppendNull() + elif isinstance(value, int): + self.builder.get().Append(value) + else: + raise TypeError('Int32Builder only accepts integer objects') + + def append_values(self, values): + for value in values: + self.append(value) + + def finish(self): + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() + + +cdef class Int64Builder(_Weakrefable): + cdef: + unique_ptr[CInt64Builder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CInt64Builder(pool)) + + def append(self, value): + if value is None or value is np.nan: + self.builder.get().AppendNull() + elif isinstance(value, int): + self.builder.get().Append(value) + else: + raise TypeError('Int64Builder only accepts integer objects') + + def append_values(self, values): + for value in values: + self.append(value) + + def finish(self): + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 06fbc478..c0bce8ef 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -3,6 +3,9 @@ import os +import numpy as np +import pyarrow as pa + def get_pymongoarrow_version(): """Single source the version.""" @@ -15,11 +18,21 @@ def get_pymongoarrow_version(): def get_extension_modules(): - modules = cythonize(['pymongoarrow/*.pyx', - 'pymongoarrow/libbson/*.pyx']) - for module in modules: + arrow_modules = cythonize(['pymongoarrow/*.pyx']) + libbson_modules = cythonize(['pymongoarrow/libbson/*.pyx']) + + for module in libbson_modules: module.libraries.append('bson-1.0') - return modules + + for module in arrow_modules: + module.include_dirs.append(np.get_include()) + module.include_dirs.append(pa.get_include()) + module.libraries.extend(pa.get_libraries()) + module.library_dirs.extend(pa.get_library_dirs()) + if os.name == 'posix': + module.extra_compile_args.append('-std=c++11') + + return arrow_modules + libbson_modules setup( @@ -27,4 +40,4 @@ def get_extension_modules(): version=get_pymongoarrow_version(), packages=find_packages(), ext_modules=get_extension_modules(), - setup_requires=['cython >= 0.29']) + setup_requires=['cython >= 0.29', 'pyarrow >= 3', 'numpy >= 1.16.6']) diff --git a/bindings/python/test/test_builder.py b/bindings/python/test/test_builder.py new file mode 100644 index 00000000..16a19304 --- /dev/null +++ b/bindings/python/test/test_builder.py @@ -0,0 +1,45 @@ +# Copyright 2021-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest import TestCase + +from pyarrow import Array, int32, int64 + +from pymongoarrow.builder import Int32Builder, Int64Builder + + +class TestIntBuildersMixin: + def test_simple(self): + builder = self.builder_cls() + builder.append(0) + builder.append_values([1, 2, 3, 4]) + arr = builder.finish() + + self.assertIsInstance(arr, Array) + self.assertEqual(arr.null_count, 0) + self.assertEqual(len(arr), 5) + self.assertEqual( + arr.to_pylist(), [0, 1, 2, 3, 4]) + self.assertEqual(arr.type, self.data_type) + + +class TestInt32Builder(TestCase, TestIntBuildersMixin): + def setUp(self): + self.builder_cls = Int32Builder + self.data_type = int32() + + +class TestInt64Builder(TestCase, TestIntBuildersMixin): + def setUp(self): + self.builder_cls = Int64Builder + self.data_type = int64() From 37dbf12320a0c6a361f18def8b4d345becf96005 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Fri, 5 Feb 2021 01:13:26 -0800 Subject: [PATCH 2/5] add methods to unwrap classes --- bindings/python/pymongoarrow/builder.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bindings/python/pymongoarrow/builder.pyx b/bindings/python/pymongoarrow/builder.pyx index a3f08a2d..21a571bc 100644 --- a/bindings/python/pymongoarrow/builder.pyx +++ b/bindings/python/pymongoarrow/builder.pyx @@ -22,7 +22,7 @@ import numpy as np cdef class Int32Builder(_Weakrefable): cdef: - unique_ptr[CInt32Builder] builder + shared_ptr[CInt32Builder] builder def __cinit__(self, MemoryPool memory_pool=None): cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -53,10 +53,13 @@ cdef class Int32Builder(_Weakrefable): def __len__(self): return self.builder.get().length() + cdef shared_ptr[CInt32Builder] unwrap(self): + return self.builder + cdef class Int64Builder(_Weakrefable): cdef: - unique_ptr[CInt64Builder] builder + shared_ptr[CInt64Builder] builder def __cinit__(self, MemoryPool memory_pool=None): cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -86,3 +89,6 @@ cdef class Int64Builder(_Weakrefable): def __len__(self): return self.builder.get().length() + + cdef shared_ptr[CInt64Builder] unwrap(self): + return self.builder From 92049494018c7a4a0d33abea1407248826ff62ed Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Mon, 8 Feb 2021 13:31:05 -0800 Subject: [PATCH 3/5] add support for more types --- bindings/python/pymongoarrow/builder.pyx | 105 ++++++++++++++++++----- bindings/python/setup.py | 3 + bindings/python/test/test_builder.py | 56 ++++++++++-- 3 files changed, 137 insertions(+), 27 deletions(-) diff --git a/bindings/python/pymongoarrow/builder.pyx b/bindings/python/pymongoarrow/builder.pyx index 21a571bc..c5b4b7e3 100644 --- a/bindings/python/pymongoarrow/builder.pyx +++ b/bindings/python/pymongoarrow/builder.pyx @@ -15,12 +15,30 @@ # Cython compiler directives # distutils: language=c++ # cython: language_level=3 +from datetime import datetime + from pyarrow.lib cimport * import numpy as np +from pyarrow import timestamp + +from pymongoarrow.utils import datetime_to_int64 + + +cdef class _BuilderBase: + def append_values(self, values): + for value in values: + self.append(value) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() -cdef class Int32Builder(_Weakrefable): +cdef class Int32Builder(_BuilderBase): cdef: shared_ptr[CInt32Builder] builder @@ -36,28 +54,17 @@ cdef class Int32Builder(_Weakrefable): else: raise TypeError('Int32Builder only accepts integer objects') - def append_values(self, values): - for value in values: - self.append(value) - def finish(self): cdef shared_ptr[CArray] out with nogil: self.builder.get().Finish(&out) return pyarrow_wrap_array(out) - @property - def null_count(self): - return self.builder.get().null_count() - - def __len__(self): - return self.builder.get().length() - cdef shared_ptr[CInt32Builder] unwrap(self): return self.builder -cdef class Int64Builder(_Weakrefable): +cdef class Int64Builder(_BuilderBase): cdef: shared_ptr[CInt64Builder] builder @@ -73,9 +80,31 @@ cdef class Int64Builder(_Weakrefable): else: raise TypeError('Int64Builder only accepts integer objects') - def append_values(self, values): - for value in values: - self.append(value) + def finish(self): + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + cdef shared_ptr[CInt64Builder] unwrap(self): + return self.builder + + +cdef class DoubleBuilder(_BuilderBase): + cdef: + shared_ptr[CDoubleBuilder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CDoubleBuilder(pool)) + + def append(self, value): + if value is None or value is np.nan: + self.builder.get().AppendNull() + elif isinstance(value, (int, float)): + self.builder.get().Append(value) + else: + raise TypeError('DoubleBuilder only accepts floats and ints') def finish(self): cdef shared_ptr[CArray] out @@ -83,12 +112,44 @@ cdef class Int64Builder(_Weakrefable): self.builder.get().Finish(&out) return pyarrow_wrap_array(out) - @property - def null_count(self): - return self.builder.get().null_count() + cdef shared_ptr[CDoubleBuilder] unwrap(self): + return self.builder - def __len__(self): - return self.builder.get().length() - cdef shared_ptr[CInt64Builder] unwrap(self): +cdef class DatetimeBuilder(_BuilderBase): + cdef: + shared_ptr[CTimestampBuilder] builder + TimestampType dtype + + def __cinit__(self, TimestampType dtype=timestamp('ms'), + MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + if dtype in (timestamp('us'), timestamp('ns')): + raise ValueError("Microsecond resolution temporal type is not " + "suitable for use with MongoDB's UTC datetime " + "type which has resolution of milliseconds.") + self.dtype = dtype + self.builder.reset(new CTimestampBuilder( + pyarrow_unwrap_data_type(self.dtype), pool)) + + def append(self, value): + if value is None or value is np.nan: + self.builder.get().AppendNull() + elif isinstance(value, datetime): + self.builder.get().Append( + datetime_to_int64(value, self.dtype)) + else: + raise TypeError('TimestampBuilder only accepts datetime objects') + + def finish(self): + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def unit(self): + return self.dtype + + cdef shared_ptr[CTimestampBuilder] unwrap(self): return self.builder diff --git a/bindings/python/setup.py b/bindings/python/setup.py index c0bce8ef..473a1902 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -29,6 +29,8 @@ def get_extension_modules(): module.include_dirs.append(pa.get_include()) module.libraries.extend(pa.get_libraries()) module.library_dirs.extend(pa.get_library_dirs()) + + # https://arrow.apache.org/docs/python/extending.html#example if os.name == 'posix': module.extra_compile_args.append('-std=c++11') @@ -40,4 +42,5 @@ def get_extension_modules(): version=get_pymongoarrow_version(), packages=find_packages(), ext_modules=get_extension_modules(), + install_requires=['pyarrow >= 3', 'pymongo >= 3.11,<4'], setup_requires=['cython >= 0.29', 'pyarrow >= 3', 'numpy >= 1.16.6']) diff --git a/bindings/python/test/test_builder.py b/bindings/python/test/test_builder.py index 16a19304..8d9a80f7 100644 --- a/bindings/python/test/test_builder.py +++ b/bindings/python/test/test_builder.py @@ -11,11 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from datetime import datetime, timedelta from unittest import TestCase -from pyarrow import Array, int32, int64 +from pyarrow import Array, timestamp, int32, int64 -from pymongoarrow.builder import Int32Builder, Int64Builder +from pymongoarrow.builder import ( + DatetimeBuilder, DoubleBuilder, Int32Builder, Int64Builder) class TestIntBuildersMixin: @@ -23,13 +25,14 @@ def test_simple(self): builder = self.builder_cls() builder.append(0) builder.append_values([1, 2, 3, 4]) + builder.append(None) arr = builder.finish() self.assertIsInstance(arr, Array) - self.assertEqual(arr.null_count, 0) - self.assertEqual(len(arr), 5) + self.assertEqual(arr.null_count, 1) + self.assertEqual(len(arr), 6) self.assertEqual( - arr.to_pylist(), [0, 1, 2, 3, 4]) + arr.to_pylist(), [0, 1, 2, 3, 4, None]) self.assertEqual(arr.type, self.data_type) @@ -43,3 +46,46 @@ class TestInt64Builder(TestCase, TestIntBuildersMixin): def setUp(self): self.builder_cls = Int64Builder self.data_type = int64() + + +class TestDate64Builder(TestCase): + def test_simple(self): + # Check default unit + builder = DatetimeBuilder() + self.assertEqual(builder.unit, timestamp('ms')) + + # Milliseconds + datetimes = [datetime(1970, 1, 1) + timedelta(milliseconds=k*100) + for k in range(5)] + builder.append(datetimes[0]) + builder.append_values(datetimes[1:]) + builder.append(None) + arr = builder.finish() + + self.assertIsInstance(arr, Array) + self.assertEqual(arr.null_count, 1) + self.assertEqual(len(arr), len(datetimes) + 1) + self.assertEqual(arr.to_pylist(), datetimes + [None]) + self.assertEqual(arr.type, timestamp('ms')) + + def test_unsupported_units(self): + with self.assertRaises(ValueError): + DatetimeBuilder(dtype=timestamp('us')) + + with self.assertRaises(ValueError): + DatetimeBuilder(dtype=timestamp('ns')) + + +class TestDoubleBuilder(TestCase): + def test_simple(self): + builder = DoubleBuilder() + builder.append(0.123) + builder.append_values([1.234, 2.345, 3.456, 4.567]) + builder.append(None) + arr = builder.finish() + + self.assertIsInstance(arr, Array) + self.assertEqual(arr.null_count, 1) + self.assertEqual(len(arr), 6) + self.assertEqual( + arr.to_pylist(), [0.123, 1.234, 2.345, 3.456, 4.567, None]) From a90ebc8eafbf518956d03692b4f2509f1f821277 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Mon, 8 Feb 2021 13:54:08 -0800 Subject: [PATCH 4/5] reorg code --- .../{builder.pyx => builders.pyi} | 11 +----- bindings/python/pymongoarrow/lib.pyx | 34 +++++++++++++++++++ bindings/python/pymongoarrow/utils.pyi | 34 +++++++++++++++++++ bindings/python/test/__init__.py | 13 +++++++ .../{test_builder.py => test_builders.py} | 2 +- 5 files changed, 83 insertions(+), 11 deletions(-) rename bindings/python/pymongoarrow/{builder.pyx => builders.pyi} (95%) create mode 100644 bindings/python/pymongoarrow/lib.pyx create mode 100644 bindings/python/pymongoarrow/utils.pyi create mode 100644 bindings/python/test/__init__.py rename bindings/python/test/{test_builder.py => test_builders.py} (98%) diff --git a/bindings/python/pymongoarrow/builder.pyx b/bindings/python/pymongoarrow/builders.pyi similarity index 95% rename from bindings/python/pymongoarrow/builder.pyx rename to bindings/python/pymongoarrow/builders.pyi index c5b4b7e3..ef1eb83a 100644 --- a/bindings/python/pymongoarrow/builder.pyx +++ b/bindings/python/pymongoarrow/builders.pyi @@ -15,15 +15,6 @@ # Cython compiler directives # distutils: language=c++ # cython: language_level=3 -from datetime import datetime - -from pyarrow.lib cimport * - -import numpy as np -from pyarrow import timestamp - -from pymongoarrow.utils import datetime_to_int64 - cdef class _BuilderBase: def append_values(self, values): @@ -135,7 +126,7 @@ cdef class DatetimeBuilder(_BuilderBase): def append(self, value): if value is None or value is np.nan: self.builder.get().AppendNull() - elif isinstance(value, datetime): + elif isinstance(value, datetime.datetime): self.builder.get().Append( datetime_to_int64(value, self.dtype)) else: diff --git a/bindings/python/pymongoarrow/lib.pyx b/bindings/python/pymongoarrow/lib.pyx new file mode 100644 index 00000000..56c7a690 --- /dev/null +++ b/bindings/python/pymongoarrow/lib.pyx @@ -0,0 +1,34 @@ +# Copyright 2021-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Cython compiler directives +# distutils: language=c++ +# cython: language_level=3 + +# Stdlib imports +import datetime + +# Python imports +import numpy as np +from pyarrow import timestamp + +# Cython imports +from pyarrow.lib cimport * + + +# Utilities +include "utils.pyi" + +# Builders +include "builders.pyi" diff --git a/bindings/python/pymongoarrow/utils.pyi b/bindings/python/pymongoarrow/utils.pyi new file mode 100644 index 00000000..1a07a7bc --- /dev/null +++ b/bindings/python/pymongoarrow/utils.pyi @@ -0,0 +1,34 @@ +# Copyright 2021-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def datetime_to_int64(dtm, data_type): + # TODO: rewrite as a cdef which directly accesses data_type as a CTimestampType instance + # TODO: make this function aware of datatype.timezone() + total_seconds = int((dtm - datetime.datetime(1970, 1, 1)).total_seconds()) + total_microseconds = int(total_seconds) * 10**6 + dtm.microsecond + + if data_type.unit == 's': + factor = 1. + elif data_type.unit == 'ms': + factor = 10. ** 3 + elif data_type.unit == 'us': + factor = 10. ** 6 + elif data_type.unit == 'ns': + factor = 10. ** 9 + else: + raise ValueError('Unsupported timestamp unit {}'.format( + data_type.unit)) + + int64_t = int(total_microseconds * factor / (10. ** 6)) + return int64_t diff --git a/bindings/python/test/__init__.py b/bindings/python/test/__init__.py new file mode 100644 index 00000000..880e7f4a --- /dev/null +++ b/bindings/python/test/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2021-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bindings/python/test/test_builder.py b/bindings/python/test/test_builders.py similarity index 98% rename from bindings/python/test/test_builder.py rename to bindings/python/test/test_builders.py index 8d9a80f7..55e65aa4 100644 --- a/bindings/python/test/test_builder.py +++ b/bindings/python/test/test_builders.py @@ -16,7 +16,7 @@ from pyarrow import Array, timestamp, int32, int64 -from pymongoarrow.builder import ( +from pymongoarrow.lib import ( DatetimeBuilder, DoubleBuilder, Int32Builder, Int64Builder) From ac9a78eca39cb208436bb8681f75abb3a2a495f4 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Mon, 8 Feb 2021 15:28:52 -0800 Subject: [PATCH 5/5] test seconds --- bindings/python/test/test_builders.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bindings/python/test/test_builders.py b/bindings/python/test/test_builders.py index 55e65aa4..43c16af5 100644 --- a/bindings/python/test/test_builders.py +++ b/bindings/python/test/test_builders.py @@ -49,13 +49,14 @@ def setUp(self): class TestDate64Builder(TestCase): - def test_simple(self): + def test_default_unit(self): # Check default unit builder = DatetimeBuilder() self.assertEqual(builder.unit, timestamp('ms')) - # Milliseconds - datetimes = [datetime(1970, 1, 1) + timedelta(milliseconds=k*100) + def _test_simple(self, tstamp_units, kwarg_name): + builder = DatetimeBuilder(dtype=timestamp(tstamp_units)) + datetimes = [datetime(1970, 1, 1) + timedelta(**{kwarg_name: k*100}) for k in range(5)] builder.append(datetimes[0]) builder.append_values(datetimes[1:]) @@ -66,7 +67,13 @@ def test_simple(self): self.assertEqual(arr.null_count, 1) self.assertEqual(len(arr), len(datetimes) + 1) self.assertEqual(arr.to_pylist(), datetimes + [None]) - self.assertEqual(arr.type, timestamp('ms')) + self.assertEqual(arr.type, timestamp(tstamp_units)) + + def test_simple(self): + # milliseconds + self._test_simple('ms', 'milliseconds') + # seconds + self._test_simple('s', 'seconds') def test_unsupported_units(self): with self.assertRaises(ValueError):