diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index b9e875ceebc74..ee92cebcb549c 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -68,34 +68,43 @@ message). See the :ref:`format_metadata_extension_types` section of the metadata specification for more details. -Pyarrow allows you to define such extension types from Python. - -There are currently two ways: - -* Subclassing :class:`PyExtensionType`: the (de)serialization is based on pickle. - This is a good option for an extension type that is only used from Python. -* Subclassing :class:`ExtensionType`: this allows to give a custom - Python-independent name and serialized metadata, that can potentially be - recognized by other (non-Python) Arrow implementations such as PySpark. +Pyarrow allows you to define such extension types from Python by subclassing +:class:`ExtensionType` and giving the derived class its own extension name +and serialization mechanism. The extension name and serialized metadata +can potentially be recognized by other (non-Python) Arrow implementations +such as PySpark. For example, we could define a custom UUID type for 128-bit numbers which can -be represented as ``FixedSizeBinary`` type with 16 bytes. -Using the first approach, we create a ``UuidType`` subclass, and implement the -``__reduce__`` method to ensure the class can be properly pickled:: +be represented as ``FixedSizeBinary`` type with 16 bytes:: - class UuidType(pa.PyExtensionType): + class UuidType(pa.ExtensionType): def __init__(self): - pa.PyExtensionType.__init__(self, pa.binary(16)) + super().__init__(pa.binary(16), "my_package.uuid") + + def __arrow_ext_serialize__(self): + # Since we don't have a parameterized type, we don't need extra + # metadata to be deserialized + return b'' - def __reduce__(self): - return UuidType, () + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + # Sanity checks, not required but illustrate the method signature. + assert storage_type == pa.binary(16) + assert serialized == b'' + # Return an instance of this subclass given the serialized + # metadata. + return UuidType() + +The special methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__`` +define the serialization of an extension type instance. For non-parametric +types such as the above, the serialization payload can be left empty. This can now be used to create arrays and tables holding the extension type:: >>> uuid_type = UuidType() >>> uuid_type.extension_name - 'arrow.py_extension_type' + 'my_package.uuid' >>> uuid_type.storage_type FixedSizeBinaryType(fixed_size_binary[16]) @@ -112,8 +121,11 @@ This can now be used to create arrays and tables holding the extension type:: ] This array can be included in RecordBatches, sent over IPC and received in -another Python process. The custom UUID type will be preserved there, as long -as the definition of the class is available (the type can be unpickled). +another Python process. The receiving process must explicitly register the +extension type for deserialization, otherwise it will fall back to the +storage type:: + + >>> pa.register_extension_type(UuidType()) For example, creating a RecordBatch and writing it to a stream using the IPC protocol:: @@ -129,43 +141,12 @@ and then reading it back yields the proper type:: >>> with pa.ipc.open_stream(buf) as reader: ... result = reader.read_all() >>> result.column('ext').type - UuidType(extension) - -We can define the same type using the other option:: - - class UuidType(pa.ExtensionType): - - def __init__(self): - pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - - def __arrow_ext_serialize__(self): - # since we don't have a parameterized type, we don't need extra - # metadata to be deserialized - return b'' - - @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): - # return an instance of this subclass given the serialized - # metadata. - return UuidType() - -This is a slightly longer implementation (you need to implement the special -methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__``), and the -extension type needs to be registered to be received through IPC (using -:func:`register_extension_type`), but it has -now a unique name:: - - >>> uuid_type = UuidType() - >>> uuid_type.extension_name - 'my_package.uuid' - - >>> pa.register_extension_type(uuid_type) + UuidType(FixedSizeBinaryType(fixed_size_binary[16])) The receiving application doesn't need to be Python but can still recognize -the extension type as a "uuid" type, if it has implemented its own extension -type to receive it. -If the type is not registered in the receiving application, it will fall back -to the storage type. +the extension type as a "my_package.uuid" type, if it has implemented its own +extension type to receive it. If the type is not registered in the receiving +application, it will fall back to the storage type. Parameterized extension type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -187,7 +168,7 @@ of the given frequency since 1970. # attributes need to be set first before calling # super init (as that calls serialize) self._freq = freq - pa.ExtensionType.__init__(self, pa.int64(), 'my_package.period') + super().__init__(pa.int64(), 'my_package.period') @property def freq(self): @@ -198,7 +179,7 @@ of the given frequency since 1970. @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): - # return an instance of this subclass given the serialized + # Return an instance of this subclass given the serialized # metadata. serialized = serialized.decode() assert serialized.startswith("freq=") @@ -209,31 +190,10 @@ Here, we ensure to store all information in the serialized metadata that is needed to reconstruct the instance (in the ``__arrow_ext_deserialize__`` class method), in this case the frequency string. -Note that, once created, the data type instance is considered immutable. If, -in the example above, the ``freq`` parameter would change after instantiation, -the reconstruction of the type instance after IPC will be incorrect. +Note that, once created, the data type instance is considered immutable. In the example above, the ``freq`` parameter is therefore stored in a private attribute with a public read-only property to access it. -Parameterized extension types are also possible using the pickle-based type -subclassing :class:`PyExtensionType`. The equivalent example for the period -data type from above would look like:: - - class PeriodType(pa.PyExtensionType): - - def __init__(self, freq): - self._freq = freq - pa.PyExtensionType.__init__(self, pa.int64()) - - @property - def freq(self): - return self._freq - - def __reduce__(self): - return PeriodType, (self.freq,) - -Also the storage type does not need to be fixed but can be parameterized. - Custom extension array class ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -252,12 +212,16 @@ the data as a 2-D Numpy array ``(N, 3)`` without any copy:: return self.storage.flatten().to_numpy().reshape((-1, 3)) - class Point3DType(pa.PyExtensionType): + class Point3DType(pa.ExtensionType): def __init__(self): - pa.PyExtensionType.__init__(self, pa.list_(pa.float32(), 3)) + super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") - def __reduce__(self): - return Point3DType, () + def __arrow_ext_serialize__(self): + return b'' + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + return Point3DType() def __arrow_ext_class__(self): return Point3DArray @@ -289,11 +253,8 @@ The additional methods in the extension class are then available to the user:: This array can be sent over IPC, received in another Python process, and the custom -extension array class will be preserved (as long as the definitions of the classes above -are available). - -The same ``__arrow_ext_class__`` specialization can be used with custom types defined -by subclassing :class:`ExtensionType`. +extension array class will be preserved (as long as the receiving process registers +the extension type using :func:`register_extension_type` before reading the IPC data). Custom scalar conversion ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -304,18 +265,24 @@ If you want scalars of your custom extension type to convert to a custom type wh For example, if we wanted the above example 3D point type to return a custom 3D point class instead of a list, we would implement:: + from collections import namedtuple + Point3D = namedtuple("Point3D", ["x", "y", "z"]) class Point3DScalar(pa.ExtensionScalar): def as_py(self) -> Point3D: return Point3D(*self.value.as_py()) - class Point3DType(pa.PyExtensionType): + class Point3DType(pa.ExtensionType): def __init__(self): - pa.PyExtensionType.__init__(self, pa.list_(pa.float32(), 3)) + super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") - def __reduce__(self): - return Point3DType, () + def __arrow_ext_serialize__(self): + return b'' + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + return Point3DType() def __arrow_ext_scalar_class__(self): return Point3DScalar diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 06832d2af97c4..a0ddf09d69423 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1437,7 +1437,10 @@ cdef class ExtensionType(BaseExtensionType): Parameters ---------- storage_type : DataType + The underlying storage type for the extension type. extension_name : str + A unique name distinguishing this extension type. The name will be + used when deserializing IPC data. Examples -------- @@ -1679,55 +1682,14 @@ cdef class PyExtensionType(ExtensionType): Concrete base class for Python-defined extension types based on pickle for (de)serialization. + .. warning:: + This class is deprecated and its deserialization is disabled by default. + :class:`ExtensionType` is recommended instead. + Parameters ---------- storage_type : DataType The storage type for which the extension is built. - - Examples - -------- - Define a UuidType extension type subclassing PyExtensionType: - - >>> import pyarrow as pa - >>> class UuidType(pa.PyExtensionType): - ... def __init__(self): - ... pa.PyExtensionType.__init__(self, pa.binary(16)) - ... def __reduce__(self): - ... return UuidType, () - ... - - Create an instance of UuidType extension type: - - >>> uuid_type = UuidType() # doctest: +SKIP - >>> uuid_type # doctest: +SKIP - UuidType(FixedSizeBinaryType(fixed_size_binary[16])) - - Inspect the extension type: - - >>> uuid_type.extension_name # doctest: +SKIP - 'arrow.py_extension_type' - >>> uuid_type.storage_type # doctest: +SKIP - FixedSizeBinaryType(fixed_size_binary[16]) - - Wrap an array as an extension array: - - >>> import uuid - >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], - ... pa.binary(16)) # doctest: +SKIP - >>> uuid_type.wrap_array(storage_array) # doctest: +SKIP - - [ - ... - ] - - Or do the same with creating an ExtensionArray: - - >>> pa.ExtensionArray.from_storage(uuid_type, - ... storage_array) # doctest: +SKIP - - [ - ... - ] """ def __cinit__(self):