Skip to content

Commit

Permalink
ENH: add in extension dtype registry (#21185)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Jul 3, 2018
1 parent a70e356 commit 2f14faf
Show file tree
Hide file tree
Showing 24 changed files with 302 additions and 82 deletions.
11 changes: 10 additions & 1 deletion doc/source/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,16 @@ extension array for IP Address data, this might be ``ipaddress.IPv4Address``.

See the `extension dtype source`_ for interface definition.

.. versionadded:: 0.24.0

:class:`pandas.api.extensions.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name.
This allows one to instantiate ``Series`` and call ``.astype()`` with a registered string name; for
example, ``'category'`` is the registered string name for the ``CategoricalDtype``.

See the `extension dtype dtypes`_ for more on how to register dtypes.

:class:`~pandas.api.extensions.ExtensionArray`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This class provides all the array-like functionality. ExtensionArrays are
limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the
Expand Down Expand Up @@ -179,6 +187,7 @@ To use a test, subclass it:
See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py
for a list of all the tests available.

.. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py
.. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
.. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py

Expand Down
25 changes: 18 additions & 7 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,23 @@ Previous Behavior:
In [3]: pi - pi[0]
Out[3]: Int64Index([0, 1, 2], dtype='int64')

.. _whatsnew_0240.api.extension:

ExtensionType Changes
^^^^^^^^^^^^^^^^^^^^^

- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
- The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword arg ``copy=False`` (:issue:`21185`)
- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
-

.. _whatsnew_0240.api.other:

Other API Changes
^^^^^^^^^^^^^^^^^

.. _whatsnew_0240.api.incompatibilities:

Expand Down Expand Up @@ -168,6 +185,7 @@ Other API Changes
^^^^^^^^^^^^^^^^^

- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`)
- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`)
-
-

Expand Down Expand Up @@ -344,13 +362,6 @@ Reshaping
-
-

ExtensionArray
^^^^^^^^^^^^^^

- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
-
-

Other
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def _reconstruct_data(values, dtype, original):
"""
from pandas import Index
if is_extension_array_dtype(dtype):
pass
values = dtype.construct_array_type()._from_sequence(values)
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
values = Index(original)._shallow_copy(values, name=None)
elif is_bool_dtype(dtype):
Expand Down Expand Up @@ -705,7 +705,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,

else:

if is_categorical_dtype(values) or is_sparse(values):
if is_extension_array_dtype(values) or is_sparse(values):

# handle extension arrays (e.g. Categorical) and sparse,
result = Series(values)._values.value_counts(dropna=dropna)
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class ExtensionArray(object):
methods:
* fillna
* dropna
* unique
* factorize / _values_for_factorize
* argsort / _values_for_argsort
Expand Down Expand Up @@ -87,14 +88,16 @@ class ExtensionArray(object):
# Constructors
# ------------------------------------------------------------------------
@classmethod
def _from_sequence(cls, scalars):
def _from_sequence(cls, scalars, copy=False):
"""Construct a new ExtensionArray from a sequence of scalars.
Parameters
----------
scalars : Sequence
Each element will be an instance of the scalar type for this
array, ``cls.dtype.type``.
copy : boolean, default False
if True, copy the underlying data
Returns
-------
ExtensionArray
Expand Down Expand Up @@ -384,6 +387,16 @@ def fillna(self, value=None, method=None, limit=None):
new_values = self.copy()
return new_values

def dropna(self):
    """Return a new ExtensionArray with the NA values removed.

    Returns
    -------
    valid : ExtensionArray
        The elements of this array for which ``isna()`` is False.
    """
    # ``isna()`` yields a boolean mask marking missing entries; invert it
    # so that boolean indexing keeps only the valid ones.
    keep = ~self.isna()
    return self[keep]

def unique(self):
"""Compute the ExtensionArray of unique values.
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/dtypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ class ExtensionDtype(_DtypeOpsMixin):
* name
* construct_from_string
Optionally one can override construct_array_type for construction
with the name of this dtype via the Registry
* construct_array_type
The `na_value` class attribute can be used to set the default NA value
for this type. :attr:`numpy.nan` is used by default.
Expand Down Expand Up @@ -156,6 +161,16 @@ def name(self):
"""
raise AbstractMethodError(self)

@classmethod
def construct_array_type(cls):
    """Return the array type associated with this dtype.

    Subclasses override this to return their array class; this
    implementation always raises.

    Returns
    -------
    type

    Raises
    ------
    NotImplementedError
        Always, unless overridden by a subclass.
    """
    raise NotImplementedError

@classmethod
def construct_from_string(cls, string):
"""Attempt to construct this type from a string.
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,11 @@ def conv(r, dtype):
def astype_nansafe(arr, dtype, copy=True):
""" return a view if copy is False, but
need to be very careful as the result shape could change! """

# dispatch on extension dtype if needed
if is_extension_array_dtype(dtype):
return dtype.array_type._from_sequence(arr, copy=copy)

if not isinstance(dtype, np.dtype):
dtype = pandas_dtype(dtype)

Expand Down
42 changes: 9 additions & 33 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
PY3, PY36)
from pandas._libs import algos, lib
from pandas._libs.tslibs import conversion

from pandas.core.dtypes.dtypes import (
CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype,
registry, CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype,
DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, IntervalDtype,
IntervalDtypeType, ExtensionDtype, PandasExtensionDtype)
IntervalDtypeType, ExtensionDtype)
from pandas.core.dtypes.generic import (
ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries,
ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass,
Expand Down Expand Up @@ -1977,38 +1978,13 @@ def pandas_dtype(dtype):
np.dtype or a pandas dtype
"""

if isinstance(dtype, DatetimeTZDtype):
return dtype
elif isinstance(dtype, PeriodDtype):
return dtype
elif isinstance(dtype, CategoricalDtype):
return dtype
elif isinstance(dtype, IntervalDtype):
return dtype
elif isinstance(dtype, string_types):
try:
return DatetimeTZDtype.construct_from_string(dtype)
except TypeError:
pass

if dtype.startswith('period[') or dtype.startswith('Period['):
# do not parse string like U as period[U]
try:
return PeriodDtype.construct_from_string(dtype)
except TypeError:
pass

elif dtype.startswith('interval') or dtype.startswith('Interval'):
try:
return IntervalDtype.construct_from_string(dtype)
except TypeError:
pass
# registered extension types
result = registry.find(dtype)
if result is not None:
return result

try:
return CategoricalDtype.construct_from_string(dtype)
except TypeError:
pass
elif isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
# un-registered extension types
if isinstance(dtype, ExtensionDtype):
return dtype

try:
Expand Down
95 changes: 90 additions & 5 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,65 @@
from .base import ExtensionDtype, _DtypeOpsMixin


class Registry(object):
    """
    Registry for dtype inference.

    The registry allows one to map a string repr of an extension
    dtype to an extension dtype.

    Multiple extension types can be registered.
    These are tried in the order in which they were registered.

    Examples
    --------
    registry.register(MyExtensionDtype)
    """
    # Class-level list: every Registry instance (and the class itself)
    # shares the same ordered collection of registered dtype classes.
    dtypes = []

    @classmethod
    def register(cls, dtype):
        """
        Append a dtype class to the end of the search order.

        Parameters
        ----------
        dtype : ExtensionDtype
            The dtype class (not an instance) to register.

        Raises
        ------
        ValueError
            If ``dtype`` is not a subclass of ``PandasExtensionDtype``
            or ``ExtensionDtype``.
        """
        if not issubclass(dtype, (PandasExtensionDtype, ExtensionDtype)):
            raise ValueError("can only register pandas extension dtypes")

        cls.dtypes.append(dtype)

    def find(self, dtype):
        """
        Look up an extension dtype.

        Parameters
        ----------
        dtype : PandasExtensionDtype or string

        Returns
        -------
        return the first matching dtype, otherwise return None
        """
        if not isinstance(dtype, compat.string_types):
            # Non-string input: hand back extension dtype classes and
            # instances unchanged; anything else is not ours to resolve.
            dtype_type = dtype if isinstance(dtype, type) else type(dtype)
            if issubclass(dtype_type, (PandasExtensionDtype, ExtensionDtype)):
                return dtype

            return None

        # String input: ask each registered dtype, in registration order,
        # to parse it. A TypeError means "not my string", so keep looking.
        for dtype_type in self.dtypes:
            try:
                return dtype_type.construct_from_string(dtype)
            except TypeError:
                pass

        return None


registry = Registry()


class PandasExtensionDtype(_DtypeOpsMixin):
"""
A np.dtype duck-typed class, suitable for holding a custom dtype.
Expand Down Expand Up @@ -265,6 +324,17 @@ def _hash_categories(categories, ordered=True):
else:
return np.bitwise_xor.reduce(hashed)

@classmethod
def construct_array_type(cls):
    """Return the array type associated with this dtype.

    Returns
    -------
    type
        The ``Categorical`` class.
    """
    # NOTE(review): imported locally, presumably to avoid a circular
    # import with the top-level pandas package -- confirm.
    from pandas import Categorical
    return Categorical

@classmethod
def construct_from_string(cls, string):
""" attempt to construct this type from a string, raise a TypeError if
Expand Down Expand Up @@ -556,11 +626,16 @@ def _parse_dtype_strict(cls, freq):
@classmethod
def construct_from_string(cls, string):
"""
attempt to construct this type from a string, raise a TypeError
if its not possible
Strict construction from a string, raise a TypeError if not
possible
"""
from pandas.tseries.offsets import DateOffset
if isinstance(string, (compat.string_types, DateOffset)):

if (isinstance(string, compat.string_types) and
(string.startswith('period[') or
string.startswith('Period[')) or
isinstance(string, DateOffset)):
# do not parse string like U as period[U]
# avoid tuple to be regarded as freq
try:
return cls(freq=string)
Expand Down Expand Up @@ -660,7 +735,7 @@ def __new__(cls, subtype=None):
try:
subtype = pandas_dtype(subtype)
except TypeError:
raise ValueError("could not construct IntervalDtype")
raise TypeError("could not construct IntervalDtype")

if is_categorical_dtype(subtype) or is_string_dtype(subtype):
# GH 19016
Expand All @@ -682,8 +757,11 @@ def construct_from_string(cls, string):
attempt to construct this type from a string, raise a TypeError
if it is not possible
"""
if isinstance(string, compat.string_types):
if (isinstance(string, compat.string_types) and
(string.startswith('interval') or
string.startswith('Interval'))):
return cls(string)

msg = "a string needs to be passed, got type {typ}"
raise TypeError(msg.format(typ=type(string)))

Expand Down Expand Up @@ -727,3 +805,10 @@ def is_dtype(cls, dtype):
else:
return False
return super(IntervalDtype, cls).is_dtype(dtype)


# register the dtypes in search order
registry.register(DatetimeTZDtype)
registry.register(PeriodDtype)
registry.register(IntervalDtype)
registry.register(CategoricalDtype)
2 changes: 1 addition & 1 deletion pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,7 @@ def astype(self, dtype, copy=True):
@cache_readonly
def dtype(self):
"""Return the dtype object of the underlying data"""
return IntervalDtype.construct_from_string(str(self.left.dtype))
return IntervalDtype(self.left.dtype.name)

@property
def inferred_type(self):
Expand Down
Loading

0 comments on commit 2f14faf

Please sign in to comment.